word_ngram 模型使用小tip

时间:2025-05-10 07:38:59
import numpy as np
import pandas
  • #vocabulary 中不存在的单词计算,切分字符求平均计算余弦相似度
  • # 本方法,使用wiki word+char模型,计算PKU 500词语相似,在wiki中的词汇
  • def cha_w2v(s):
  • res = []
  • for i in s:
  • print(i, model[i])
  • (model[i])
  • return sum(res)/len(res)
  • def Cosine(vec1, vec2):
  • npvec1, npvec2 = (vec1), (vec2)
  • return (npvec2)/(((npvec1**2).sum()) * ((npvec2**2).sum()))
  • def word2vec(word1, word2):
  • import gensim
  • model = .load_word2vec_format(r'D:\paper\github\baike\', binary=False)
  • try:
  • return (word1, word2) # 两者的相似度
  • except Exception as e:
  • print(e)
  • def open_file(path, path1):
  • result = []
  • res = []
  • with open(path, 'r') as f:
  • result = ()
  • with open(path1, "r") as f1:
  • res = ()
  • # print(result, res)
  • for i in range(len(result)):
  • print(word2vec(result[i].replace("\n", ""), res[i].replace("\n", "")))
  • if __name__ == "__main__":
  • path = r"D:\three-graduate\github\PKU\test_0.txt"
  • path1 = r"D:\three-graduate\github\PKU\test_1.txt"
  • open_file(path, path1)