本文通过两种方式构造了LSI模型来计算文本相似度,主要参考资料如下:
- 如何计算两个文档的相似度(一)
- 如何计算两个文档的相似度(二)
- singular-value-decomposition-fast-track-tutorial
- latent-semantic-indexing-fast-track-tutorial
1.利用python gensim
包构建LSI模型
# -*- coding:utf-8 -*-
# 利用lsi模型计算文档相似度
from gensim import corpora, models, similarities
import logging
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
documents = ["Shipment of gold damaged in a fire", "Delivery of silver arrived in a silver truck",
"Shipment of gold arrived in a truck"]
# 对每篇文档进行分词
texts = [[word for word in documents.lower().split()] for documents in documents]
# 建立词典,将所有文档中出现的词都放到词典中,并建立词-id映射
dictionary = corpora.Dictionary(texts)
dictionary.token2id # 词与id 的映射
corpus = [dictionary.doc2bow(text) for text in texts] # 将每篇文档以id-词频向量点形式表示
tfidf = models.TfidfModel(corpus) # 建立tf-idf模型,计算每个词的tf和idf
tfidf.dfs # 查看每个词的df值
tfidf.idfs # 查看每个词的idf值
corpus_tfidf = tfidf[corpus] # 将上述以词频表示的文档表示为以tf-idf值表示的文档向量
for doc in corpus_tfidf:
print(doc)
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # 根据上述建立的tf-idf值表示的文档向量(矩阵),建立lsi模型,并假设topic数为2
lsi.print_topics(2)
corpus_lsi = lsi[corpus_tfidf] # 将文档映射到2维的topic空间中
for doc in corpus_lsi:
print(doc)
# 同样,可以构建lda模型
# lda = models.LdaModel(corpus_tfidf,id2word=dictionary,num_topics=2)
# lda.print_topics(2)
index = similarities.MatrixSimilarity(lsi[corpus]) # 创建索引,用于后续计算相似度
query = 'gold silver truck'
query_bow = dictionary.doc2bow(query.lower().split()) # 将查询文档向量化
print(query_bow)
query_lsi = lsi[query_bow] # 用之前训练好的lsi模型将其映射到2维topic空间
print(query_lsi)
sims = index[query_lsi] # 计算query和index中的文档点相似度
print(list(enumerate(sims)))
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sort_sims)
2.利用numpy.linalg.svd
进行SVD分解,构造LSI模型
import numpy as np
A = np.mat([[1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0], [1, 1, 0, 1, 0, 0, 1, 1, 0, 2, 1],
[1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1]]).transpose()
q = np.mat([0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1]).transpose()
u, sigma, vt = np.linalg.svd(A) # svd分解,获得u,sigma,v
# Implement a Rank 2 Approximation
u1 = u[:, :2]
s1 = sigma[:2]
v1 = vt.transpose()[:, :2].transpose()
# individual document vectors
d1 = v1[:, 0].transpose()
d2 = v1[:, 1].transpose()
d3 = v1[:, 2].transpose()
# Find the new query vector coordinates in the reduced 2-dimensional space.
q1 = q.T * u1 * np.mat(np.diag(s1)).I
# calculate the cos similarity of two documents
def cos_sim(x, y):
num = x * y.T
denom = np.linalg.norm(x) * np.linalg.norm(y)
cos = num / denom
return cos
cos_sim(d1, q1)
cos_sim(d2, q1)
cos_sim(d3, q1)