test1.py 691 B

123456789101112131415161718192021222324
  import jieba
  from simhash import Simhash

  # Two short sentences that differ only in a couple of characters near the end.
  texts = (
      '我很想要打游戏,但是女朋友会生气!',
      '我很想要打游戏,但是女朋友非常生气!',
  )

  # Tokenise both sentences first (cut_all=True is forwarded to jieba), then
  # print one simhash fingerprint value per sentence, in order.
  token_lists = [jieba.lcut(text, cut_all=True) for text in texts]
  for tokens in token_lists:
      print(Simhash(tokens).value)

  # To compare the two fingerprints directly, use
  #   Simhash(token_lists[0]).distance(Simhash(token_lists[1]))
  # Original author's note: that prints 6, because on short text even a small
  # wording change shows up clearly in the simhash; try longer texts to compare.
  9. 1495213811346268772
  10. 351422926174413540
  11. 1495213811346268772
  12. 1504362022967304932
  13. 5oiR5b6I5oOz6KaB5omT5ri45oiP77yM5L2G5piv5aWz5pyL5Y+L5Lya55Sf5rCU77yB
  14. 5oiR5b6I5oOz6KaB5omT5ri45oiP77yM5L2G5piv5aWz5pyL5Y+L6Z2e5bi455Sf5rCU77yB