import urllib.request
from collections import defaultdict

import jieba
import jieba.analyse
import gensim
from gensim import corpora, models, similarities

# Fetch the source text and extract the top 20 keywords with jieba's TF-IDF extractor.
data = urllib.request.urlopen("http://127.0.0.1/txt1.txt").read().decode("utf-8", "ignore")
word10 = jieba.analyse.extract_tags(data, 20)
print(word10)

# Fetch the comment text and strip tabs, newlines, and spaces from both documents.
data1 = data.replace('\t', '').replace('\n', '').replace(' ', '')
data2 = urllib.request.urlopen("http://127.0.0.1/comment.txt").read().decode("utf-8", "ignore").replace('\t', '').replace('\n', '').replace(' ', '')

# Segment both documents with jieba and join the tokens with spaces.
d1 = jieba.cut(data1)
d2 = jieba.cut(data2)
data01 = ""
for item in d1:
    data01 += item + " "
data11 = data01.replace(",", "")
data21 = ""
for item in d2:
    data21 += item + " "
data22 = data21.replace(",", "")

documents = [data11, data22]
print(documents)

# Split each document into a token list and count token frequencies.
texts = [[word for word in document.split()] for document in documents]
print(texts)
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
# Optional low-frequency filter, left disabled as in the original:
# texts = [[word for word in text if frequency[word] > 3]
#          for text in texts]

# Build the dictionary over both documents and save it to disk.
dictionary = corpora.Dictionary(texts)
dictionary.save("C:/Users/Administrator/Desktop/tripadvisor_gm/tripadvisor_code_python/test_dict1.txt")

# Fetch the query document, segment it, and convert it to a bag-of-words vector.
data3 = urllib.request.urlopen("http://127.0.0.1/txt2.txt").read().decode("utf-8", "ignore")
d3 = jieba.cut(data3)
data31 = ""
for item in d3:
    data31 += item + " "
data31 = data31.replace(",", "")
new_doc = data31
new_vec = dictionary.doc2bow(new_doc.split())

# Serialize the corpus, train the TF-IDF model, and build the sparse similarity index.
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize("C:/Users/Administrator/Desktop/tripadvisor_gm/tripadvisor_code_python/test_corpus1.txt", corpus)
tfidf = models.TfidfModel(corpus)
feature_num = len(dictionary.token2id.keys())
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=feature_num)

# Score the query against both indexed documents.
sim = index[tfidf[new_vec]]
print(sim)

# word1 word2 word3...wordn
# novel recommendation...
# automatic matching recommendation...
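
# A minimal follow-up sketch, not part of the original script: `sim` is an
# array with one cosine-similarity score per indexed document, so ranking
# it turns the index into a simple "most similar text" recommender, which
# is what the trailing comments above hint at. The variable names `ranked`,
# `doc_id`, and `score` are illustrative assumptions.
ranked = sorted(enumerate(sim), key=lambda pair: pair[1], reverse=True)
for doc_id, score in ranked:
    # documents[doc_id] is the space-joined token string built earlier
    print("document %d: similarity %.4f" % (doc_id, score))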