好的,我已经搜索了网络。下面是一些在 Windows 环境下用 Java 进行语义相似度比较的方法(适用于只能运行 Java 代码的场景):
方法:
Java 代码示例 (使用余弦相似度和TF-IDF):
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Lexical text-similarity demo: tokenizes two texts, weights their terms with
 * smoothed TF-IDF, and compares the resulting sparse vectors by cosine similarity.
 *
 * <p>The measure is purely lexical (shared tokens), so texts that express the same
 * meaning with different words will still score low.
 */
public class SemanticSimilarity {

    /**
     * Compares two hard-coded sample sentences and prints their similarity.
     *
     * @param args unused
     * @throws IOException never thrown by the current body; kept so the signature
     *     stays compatible with existing callers
     */
    public static void main(String[] args) throws IOException {
        String text1 = "猫坐在垫子上。";
        String text2 = "垫子上有一只猫。";

        // 1. Pre-processing: turn each text into a token list.
        List<String> tokens1 = tokenize(text1);
        List<String> tokens2 = tokenize(text2);

        // 2. TF-IDF vectors; the corpus is just the two documents being compared.
        List<List<String>> corpus = Arrays.asList(tokens1, tokens2);
        Map<String, Double> tfidf1 = calculateTfIdf(tokens1, corpus);
        Map<String, Double> tfidf2 = calculateTfIdf(tokens2, corpus);

        // 3. Cosine similarity of the two vectors.
        double similarity = cosineSimilarity(tfidf1, tfidf2);
        System.out.println("相似度: " + similarity);
    }

    /**
     * Splits text into tokens.
     *
     * <p>A token is either a maximal run of letters/digits (plus any combining marks
     * attached to them) or a single ideographic character. Ideographs are emitted one
     * per token because scripts such as Chinese do not separate words with spaces, so
     * a whitespace split would turn a whole sentence into one token. Whitespace and
     * punctuation only act as separators and never appear in the output.
     *
     * @param text the text to tokenize; may be {@code null}
     * @return the tokens in order of appearance; empty for {@code null}, empty or
     *     separator-only input
     */
    public static List<String> tokenize(String text) {
        List<String> tokens = new ArrayList<>();
        if (text == null) {
            return tokens;
        }
        StringBuilder word = new StringBuilder();
        // Iterate by code point so supplementary-plane characters are not cut in half.
        for (int i = 0; i < text.length(); ) {
            int codePoint = text.codePointAt(i);
            i += Character.charCount(codePoint);
            if (Character.isIdeographic(codePoint)) {
                // Ideographs are letters too, so this check must come first.
                flushWord(word, tokens);
                tokens.add(new String(Character.toChars(codePoint)));
            } else if (isWordCodePoint(codePoint)) {
                word.appendCodePoint(codePoint);
            } else {
                flushWord(word, tokens);
            }
        }
        flushWord(word, tokens);
        return tokens;
    }

    /** Returns whether the code point belongs inside a non-ideographic word token. */
    private static boolean isWordCodePoint(int codePoint) {
        if (Character.isLetterOrDigit(codePoint)) {
            return true;
        }
        // Combining marks are not letters, but splitting on them would break words
        // written with decomposed accents or vowel signs.
        int type = Character.getType(codePoint);
        return type == Character.NON_SPACING_MARK
                || type == Character.COMBINING_SPACING_MARK
                || type == Character.ENCLOSING_MARK;
    }

    /** Appends the pending word (if any) to {@code tokens} and clears the buffer. */
    private static void flushWord(StringBuilder word, List<String> tokens) {
        if (word.length() > 0) {
            tokens.add(word.toString());
            word.setLength(0);
        }
    }

    /**
     * Computes the TF-IDF weight of every distinct token of one document.
     *
     * <p>{@code tf = count / tokens.size()} and
     * {@code idf = ln((1 + N) / (1 + df)) + 1}, where {@code N} is the number of
     * documents and {@code df} the number of documents containing the term. The
     * smoothed IDF is always positive. The unsmoothed {@code ln(N / (1 + df))} is zero
     * or negative for a two-document corpus, which wipes out every term that
     * distinguishes the two documents.
     *
     * @param tokens the tokens of the document to weight
     * @param allDocuments the corpus used for document frequencies
     * @return a map from term to TF-IDF weight; empty when {@code tokens} is empty
     */
    public static Map<String, Double> calculateTfIdf(List<String> tokens, List<List<String>> allDocuments) {
        Map<String, Integer> termFrequency = new HashMap<>();
        for (String token : tokens) {
            termFrequency.merge(token, 1, Integer::sum);
        }

        Map<String, Double> tfidf = new HashMap<>();
        int documentCount = allDocuments.size();
        for (Map.Entry<String, Integer> entry : termFrequency.entrySet()) {
            String term = entry.getKey();
            double tf = (double) entry.getValue() / tokens.size();
            double idf = Math.log((1.0 + documentCount) / (1.0 + documentFrequency(term, allDocuments))) + 1.0;
            tfidf.put(term, tf * idf);
        }
        return tfidf;
    }

    /**
     * Counts the documents that contain the given term.
     *
     * @param term the term to look for
     * @param allDocuments the corpus to scan
     * @return the number of documents in which {@code term} occurs at least once
     */
    private static int documentFrequency(String term, List<List<String>> allDocuments) {
        int count = 0;
        for (List<String> document : allDocuments) {
            if (document.contains(term)) {
                count++;
            }
        }
        return count;
    }

    /**
     * Computes the cosine similarity of two sparse vectors given as term-to-weight maps.
     *
     * @param vector1 the first vector; missing terms count as 0
     * @param vector2 the second vector; missing terms count as 0
     * @return the cosine of the angle between the vectors, or 0 when either vector
     *     has zero magnitude
     */
    public static double cosineSimilarity(Map<String, Double> vector1, Map<String, Double> vector2) {
        Set<String> allTerms = new HashSet<>(vector1.keySet());
        allTerms.addAll(vector2.keySet());

        double dotProduct = 0;
        double squaredNorm1 = 0;
        double squaredNorm2 = 0;
        for (String term : allTerms) {
            double value1 = vector1.getOrDefault(term, 0.0);
            double value2 = vector2.getOrDefault(term, 0.0);
            dotProduct += value1 * value2;
            squaredNorm1 += value1 * value1;
            squaredNorm2 += value2 * value2;
        }

        double magnitude1 = Math.sqrt(squaredNorm1);
        double magnitude2 = Math.sqrt(squaredNorm2);
        if (magnitude1 == 0 || magnitude2 == 0) {
            return 0;
        }
        // Floating-point rounding can push the quotient marginally outside [-1, 1].
        double cosine = dotProduct / (magnitude1 * magnitude2);
        return Math.max(-1.0, Math.min(1.0, cosine));
    }
}
要点:
引用: