Getting Started with SentenceTransformer
1 Introduction to SentenceTransformer
SentenceTransformer is mainly used to create embeddings for sentences, text, and images. These embeddings can then be used for tasks such as similarity comparison and similarity search over text and images.
# SentenceTransformer official website: https://www.sbert.net/
# Install SentenceTransformer
pip install -U sentence-transformers
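Besides text, the library can also embed images into the same vector space as text by loading a CLIP model. Below is a minimal sketch assuming a local image file (two_dogs.jpg is just a placeholder) and the pretrained clip-ViT-B-32 model:
from sentence_transformers import SentenceTransformer, util
from PIL import Image

# CLIP model that maps images and text into a shared vector space
model = SentenceTransformer("clip-ViT-B-32")

# Encode an image (placeholder path) and a few candidate captions
img_emb = model.encode(Image.open("two_dogs.jpg"))
text_emb = model.encode([
    "Two dogs playing in the snow",
    "A cat sitting on a sofa",
    "A red sports car",
])

# Cosine similarity between the image and each caption
print(util.cos_sim(img_emb, text_emb))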
2 Basic Usage
2.1 Embedding text
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
# Our sentences we like to encode
sentences = [
"This framework generates embeddings for each input sentence",
"Sentences are passed as a list of string.",
"The quick brown fox jumps over the lazy dog.",
]
# Sentences are encoded by calling model.encode()
sentence_embeddings = model.encode(sentences)
# Print the embeddings
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
2.2 Comparing the similarity of two sentences
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer("all-MiniLM-L6-v2")
# Sentences are encoded by calling model.encode()
emb1 = model.encode("This is a red cat with a hat.")
emb2 = model.encode("Have you seen my red cat?")
cos_sim = util.cos_sim(emb1, emb2)
print("Cosine-Similarity:", cos_sim)
2.3 Finding the most similar texts in a corpus
It can also be combined with Elasticsearch (see the sketch at the end of this section).
from sentence_transformers import SentenceTransformer, util
import torch
embedder = SentenceTransformer("E:\\model\\distiluse-base-multilingual-cased-v1")
# Corpus with example sentences
corpus = [
"A man is eating food.",
"A man is eating a piece of bread.",
"The girl is carrying a baby.",
"A man is riding a horse.",
"A woman is playing violin.",
"Two men pushed carts through the woods.",
"A man is riding a white horse on an enclosed ground.",
"A monkey is playing drums.",
"A cheetah is running behind its prey.",
]
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
# Query sentences:
queries = [
"A man is eating pasta.",
"Someone in a gorilla costume is playing a set of drums.",
"A cheetah chases prey on across a field.",
]
# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # We use cosine similarity and torch.topk to find the 5 highest scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: {:.4f})".format(score))
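As mentioned at the start of this section, the same embeddings can be stored in Elasticsearch for large-scale retrieval. Below is a minimal, untested sketch assuming a local Elasticsearch 8.x instance and the official elasticsearch Python client; the index name "sentences" and the field names are placeholders:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

es = Elasticsearch("http://localhost:9200")  # assumes a local Elasticsearch 8.x instance
model = SentenceTransformer("all-MiniLM-L6-v2")

# Create an index with a dense_vector field matching the model's 384-dim output
es.indices.create(
    index="sentences",
    mappings={
        "properties": {
            "text": {"type": "text"},
            "embedding": {"type": "dense_vector", "dims": 384, "index": True, "similarity": "cosine"},
        }
    },
)

# Store each sentence together with its embedding
corpus = ["A man is eating food.", "A monkey is playing drums."]
for i, text in enumerate(corpus):
    es.index(index="sentences", id=i, document={"text": text, "embedding": model.encode(text).tolist()})
es.indices.refresh(index="sentences")

# kNN search: retrieve the sentences whose embeddings are closest to the query embedding
query_vector = model.encode("A man is eating pasta.").tolist()
resp = es.search(
    index="sentences",
    knn={"field": "embedding", "query_vector": query_vector, "k": 2, "num_candidates": 10},
)
for hit in resp["hits"]["hits"]:
    print(hit["_source"]["text"], hit["_score"])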