LSI文本相似度计算

Posted by zluckyH on September 8, 2017

本文通过两种方式构造了LSI模型来计算文本相似度,主要参考资料如下:

1.利用python gensim包构建LSI模型

# -*- coding:utf-8 -*-

# 利用lsi模型计算文档相似度
from gensim import corpora, models, similarities
import logging

logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)

documents = ["Shipment of gold damaged in a fire", "Delivery of silver arrived in a silver truck",
             "Shipment of gold arrived in a truck"]

# 对每篇文档进行分词
texts = [[word for word in documents.lower().split()] for documents in documents]

# 建立词典,将所有文档中出现的词都放到词典中,并建立词-id映射
dictionary = corpora.Dictionary(texts)
dictionary.token2id  # 词与id 的映射

corpus = [dictionary.doc2bow(text) for text in texts]  # 将每篇文档以id-词频向量点形式表示

tfidf = models.TfidfModel(corpus)  # 建立tf-idf模型,计算每个词的tf和idf

tfidf.dfs # 查看每个词的df值
tfidf.idfs # 查看每个词的idf值
corpus_tfidf = tfidf[corpus]  # 将上述以词频表示的文档表示为以tf-idf值表示的文档向量
for doc in corpus_tfidf:
    print(doc)

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)  # 根据上述建立的tf-idf值表示的文档向量(矩阵),建立lsi模型,并假设topic数为2
lsi.print_topics(2)
corpus_lsi = lsi[corpus_tfidf]  # 将文档映射到2维的topic空间中
for doc in corpus_lsi:
    print(doc)
# 同样,可以构建lda模型
# lda =  models.LdaModel(corpus_tfidf,id2word=dictionary,num_topics=2)
# lda.print_topics(2)

index = similarities.MatrixSimilarity(lsi[corpus])  # 创建索引,用于后续计算相似度
query = 'gold silver truck'
query_bow = dictionary.doc2bow(query.lower().split())  # 将查询文档向量化
print(query_bow)
query_lsi = lsi[query_bow]  # 用之前训练好的lsi模型将其映射到2维topic空间
print(query_lsi)
sims = index[query_lsi]  # 计算query和index中的文档点相似度
print(list(enumerate(sims)))
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sort_sims)

2.利用numpy.linalg.svd 进行SVD分解,构造LSI模型

import numpy as np

A = np.mat([[1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0], [1, 1, 0, 1, 0, 0, 1, 1, 0, 2, 1],
            [1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1]]).transpose()
q = np.mat([0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1]).transpose()
u, sigma, vt = np.linalg.svd(A)  # svd分解,获得u,sigma,v

# Implement a Rank 2 Approximation
u1 = u[:, :2]
s1 = sigma[:2]
v1 = vt.transpose()[:, :2].transpose()
# individual document vectors
d1 = v1[:, 0].transpose()
d2 = v1[:, 1].transpose()
d3 = v1[:, 2].transpose()
# Find the new query vector coordinates in the reduced 2-dimensional space.
q1 = q.T * u1 * np.mat(np.diag(s1)).I


# calculate the cos similarity of two documents
def cos_sim(x, y):
    num = x * y.T
    denom = np.linalg.norm(x) * np.linalg.norm(y)
    cos = num / denom
    return cos


cos_sim(d1, q1)
cos_sim(d2, q1)
cos_sim(d3, q1)