关于文档相似度计算python代码求助
在网上找了一段计算多个文档相似度的python代码,但怎么都运行不成功,请有经验的人士帮助分析一下,错在什么地方,如何解决。十分感谢!代码如下:import jieba
import docx
import os
import numpy as np
import re
import xlwt
from gensim import corpora, models, similarities
path="D:\QQFile\sx1"
files=os.listdir(path)
print(files)
texts=[]
for file in files:
f = docx.Document(path+'/'+ file)
text = ''
for para in f.paragraphs:
text += para.text
texts.append(text)
a=[]
for text in texts:
b=[word for word in jieba.cut(text)]
a.append(b)
print("正在测试")
ff=[]
for i in range(len(a)):
test_list=a[i]
dictionary=corpora.Dictionary(a)
# dictionary.keys()
# dictionary.token2id
corpus=[dictionary.doc2bow(doc) for doc in a]
doc_test_vec = dictionary.doc2bow(test_list)
tfidf = models.TfidfModel(corpus)
tfidf[doc_test_vec]
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary.keys()))
sim = index[tfidf[doc_test_vec]]
for j in range(len(sim)):
print('与第',i,'个文件对比:','第', j, '文件的相似度为:', sim[j])
print(sorted(enumerate(sim), key=lambda item: -item[1]))
ff.append(sim)
print(len(ff))
xls=xlwt.Workbook()
sht1=xls.add_sheet('sheet1')
file_studentname=[]
for studentname in files:
studentnames=re.sub(".docx","",studentname)
file_studentname.append(studentnames)
# for i in range(len(ff)):
for k,filename in zip(np.arange(len(files)),file_studentname):
sht1.write(int(k+1),0,filename)
sht1.write(0,int(k+1) , filename)
for i in range(len(ff)):
for j in range(len(ff[i])):
h=ff[i].tolist()
sht1.write(j+1,i+1,float(h[j]))
xls.save('C:\Users\Administrator\Desktop\相似度.xls')
运行后的出现的信息如下:
==================== RESTART: D:\Python27\ldjtest\ldj3.py ====================
['\xc8\xce\xe5\xfb\xe6\xc3.docx', '\xcb\xef\xba\xae\xb1\xf9.docx', '\xcb\xef\xd1\xd2.docx', '\xd5\xc5\xe7\xb2\xf6\xa9.docx']
Building prefix dict from the default dictionary ...
Dumping model to file cache c:\docume~1\admini~1\locals~1\temp\jieba.cache
Loading model cost 18.594 seconds.
Prefix dict has been built succesfully.
('\xd3\xeb\xb5\xda', 0, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 0, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 1.0000005)
('\xd3\xeb\xb5\xda', 0, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 1, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 0.027729953)
('\xd3\xeb\xb5\xda', 0, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 2, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 0.02634788)
('\xd3\xeb\xb5\xda', 0, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 3, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 0.197953)
[(0, 1.0000005), (3, 0.197953), (1, 0.027729953), (2, 0.02634788)]
('\xd3\xeb\xb5\xda', 1, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 0, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 0.027729953)
('\xd3\xeb\xb5\xda', 1, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 1, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 1.000001)
('\xd3\xeb\xb5\xda', 1, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 2, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 0.035550788)
('\xd3\xeb\xb5\xda', 1, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 3, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 0.041584983)
[(1, 1.000001), (3, 0.041584983), (2, 0.035550788), (0, 0.027729953)]
('\xd3\xeb\xb5\xda', 2, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 0, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 0.02634788)
('\xd3\xeb\xb5\xda', 2, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 1, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 0.035550788)
('\xd3\xeb\xb5\xda', 2, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 2, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 1.0000013)
('\xd3\xeb\xb5\xda', 2, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 3, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 0.062224135)
[(2, 1.0000013), (3, 0.062224135), (1, 0.035550788), (0, 0.02634788)]
('\xd3\xeb\xb5\xda', 3, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 0, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 0.197953)
('\xd3\xeb\xb5\xda', 3, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 1, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 0.041584983)
('\xd3\xeb\xb5\xda', 3, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 2, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 0.062224135)
('\xd3\xeb\xb5\xda', 3, '\xb8\xf6\xce\xc4\xbc\xfe\xb6\xd4\xb1\xc8\xa3\xba', '\xb5\xda', 3, '\xce\xc4\xbc\xfe\xb5\xc4\xcf\xe0\xcb\xc6\xb6\xc8\xce\xaa\xa3\xba', 1.0000046)
[(3, 1.0000046), (0, 0.197953), (2, 0.062224135), (1, 0.041584983)]
4
Traceback (most recent call last):
File "D:\Python27\ldjtest\ldj3.py", line 57, in <module>
xls.save('C:\Users\Administrator\Desktop\相似度.xls')
File "D:\Python27\lib\site-packages\xlwt\Workbook.py", line 710, in save
doc.save(filename_or_stream, self.get_biff_data())
File "D:\Python27\lib\site-packages\xlwt\Workbook.py", line 674, in get_biff_data
shared_str_table = self.__sst_rec()
File "D:\Python27\lib\site-packages\xlwt\Workbook.py", line 636, in __sst_rec
return self.__sst.get_biff_record()
File "D:\Python27\lib\site-packages\xlwt\BIFFRecords.py", line 77, in get_biff_record
self._add_to_sst(s)
File "D:\Python27\lib\site-packages\xlwt\BIFFRecords.py", line 92, in _add_to_sst
u_str = upack2(s, self.encoding)
File "D:\Python27\lib\site-packages\xlwt\UnicodeUtils.py", line 50, in upack2
us = unicode(s, encoding)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc8 in position 0: ordinal not in range(128)
请各位大侠帮忙分析一下,该如何解决这个问题。几个文档是DOCX格式的,都放在一个文件夹里的。