# 词向量转换成句向量的文本相似度计算
# coding: utf-8
# In[2]:
# Load the previously trained word-vector model.
from gensim.models import word2vec
w2v = word2vec.Word2Vec.load("d:/chat_data/corpus_vector.model")

# Tokenize the raw text: one sentence per line, segmented with jieba.
import jieba
import re

raw_data = []
# The context manager closes the file even if decoding or tokenizing fails,
# and iterating the handle avoids loading every line into memory at once.
with open("******", "r", encoding="utf-8") as w:
    for line in w:
        # Trim surrounding whitespace and drop inner ASCII spaces before
        # handing the sentence to the segmenter.
        newline = line.strip().replace(" ", "")
        raw_data.append(list(jieba.cut(newline)))
import numpy as np
# In[72]:
###转换成句向量
def sent2vec(s, model=None):
    """Convert a tokenized sentence into a unit-length sentence vector.

    The vectors of all in-vocabulary tokens are summed and the sum is scaled
    to unit Euclidean length.

    Args:
        s: Iterable of tokens (words) making up the sentence.
        model: Object whose ``wv`` attribute maps a token to its vector and
            raises ``KeyError`` for unknown tokens. Defaults to the
            module-level ``w2v`` model.

    Returns:
        A 1-D numpy array holding the normalized sum. An empty array is
        returned when no token is in the vocabulary or when the summed
        vector has zero norm, so that callers checking the vector length
        skip the sentence instead of receiving NaN values.
    """
    if model is None:
        model = w2v
    vectors = []
    for token in s:
        try:
            vectors.append(model.wv[token])
        except KeyError:
            # Out-of-vocabulary tokens contribute nothing to the sentence.
            continue
    if not vectors:
        return np.array([])
    total = np.array(vectors).sum(axis=0)
    norm = np.sqrt((total ** 2).sum())
    if norm == 0:
        # Normalizing a zero vector would divide by zero.
        return np.array([])
    return total / norm
# Sentence vectors of the raw text set, plus a lookup from each vector
# (as a tuple) back to the sentence text it was built from.
newdata = []
newdata_dict = {}
seed = 0
for tokens in raw_data:
    vec = sent2vec(tokens)
    # Keep only full 300-dimensional vectors. sent2vec can yield a scalar or
    # an empty/short array when no token is in the vocabulary; checking the
    # shape explicitly replaces the former catch-all `except`.
    if np.ndim(vec) != 1 or len(vec) < 300:
        continue
    newdata.append(vec)
    # NOTE: the former `times += 1` here ran before `times` was defined; the
    # resulting NameError was swallowed and left newdata_dict empty. The
    # counter was never read (it is reset before the next loop), so it is
    # dropped.
    newdata_dict[tuple(vec)] = "".join(tokens)
# Load the second text set: the first comma-separated field of every line
# after the first one, cleaned and tokenized with jieba.
import re
import string
import jieba
from zhon.hanzi import punctuation

standard_data = []
# ASCII letters, digits and ASCII punctuation are removed from the text.
# The previous hand-written character class contained an unescaped double
# quote that terminated the string literal; building the class from
# string.punctuation covers the same characters with valid syntax.
ascii_noise = re.compile("[A-Za-z0-9%s]" % re.escape(string.punctuation))
# Runs of CJK punctuation (provided by zhon) are removed as well.
hanzi_punct = re.compile("[%s]+" % punctuation)

times = 0
# The context manager closes the file even if a line fails to decode.
with open("******", "r", encoding="gbk") as ws:
    for times, i in enumerate(ws, start=1):
        if times == 1:
            # The first line is skipped (presumably a header row).
            continue
        newline = i.strip().split(",")[0]
        newline = ascii_noise.sub("", newline)
        newline = newline.replace(" ", "")
        newline = hanzi_punct.sub("", newline)
        standard_data.append(list(jieba.cut(newline)))
# Sentence vectors of the second text set, plus a lookup from each vector
# (as a tuple) back to the sentence text it was built from.
standard_newdata = []
new_standard_dict = {}
for tokens in standard_data:
    vec = sent2vec(tokens)
    # Keep only full 300-dimensional vectors. sent2vec can yield a scalar or
    # an empty/short array when no token is in the vocabulary; checking the
    # shape explicitly replaces the former catch-all `except`, which would
    # also have hidden unrelated errors.
    if np.ndim(vec) != 1 or len(vec) < 300:
        continue
    standard_newdata.append(vec)
    new_standard_dict[tuple(vec)] = "".join(tokens)
# In[45]:
from gensim import corpora, models, similarities
# In[114]:
####计算余弦夹角
import math
def cos_dist(a, b):
    """Return the cosine of the angle between two equal-length vectors.

    Args:
        a: First sequence of numbers.
        b: Second sequence of numbers.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``None`` when the lengths differ or the denominator is zero.
    """
    if len(a) != len(b):
        return None

    dot = 0.0
    norm_a_sq = 0.0
    norm_b_sq = 0.0
    # Accumulate the dot product and both squared norms in a single pass.
    for x, y in zip(a, b):
        dot += x * y
        norm_a_sq += x ** 2
        norm_b_sq += y ** 2

    denominator = math.sqrt(norm_a_sq * norm_b_sq)
    return None if denominator == 0.0 else dot / denominator
# In[ ]:
####计算两个文本集的相似度
# Cosine similarity for every (second-set sentence, raw sentence) pair,
# keyed by the two sentence texts joined with a single space.
result_data = {}
if standard_newdata:
    # Resolve each raw sentence's text once instead of once per pair:
    # converting a vector to a tuple for the lookup is loop-invariant work.
    raw_pairs = [(newdata_dict[tuple(vec)], vec) for vec in newdata]
    for std_vec in standard_newdata:
        std_text = new_standard_dict[tuple(std_vec)]
        for raw_text, raw_vec in raw_pairs:
            result_data[std_text + " " + raw_text] = cos_dist(std_vec, raw_vec)
# 声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。
# - 上一篇: 一段简单实现【余弦相似度】的python代码
# - 下一篇: R语言学习笔记-概率函数