queriesの文章と、docsのファイル内の各文との類似度を比較したいのですが、実行すると「ValueError: only one element tensors can be converted to Python scalars」
というエラーが出てしまいます。調べてみたのですが、原因がわかりませんでした。
ご協力お願いします。```
コード
import
"""Rank the sentences in ``text.csv`` by cosine similarity to a query.

Embeddings are position-weighted means of the last hidden state of
``rinna/japanese-gpt2-medium``. Queries are wrapped in ``[`` / ``]`` tokens
and documents in ``{`` / ``}`` tokens before being embedded.
"""
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("rinna/japanese-gpt2-medium")
model = AutoModel.from_pretrained("rinna/japanese-gpt2-medium")

# Inference only: disable dropout so embeddings are deterministic.
model.eval()

queries = [
    "元気な男の子です。",
]

docs = pd.read_csv("text.csv", encoding="utf-8")
target_docs = docs["文章リスト"].tolist()

# Bracket token ids used to mark queries ("[" ... "]") and documents
# ("{" ... "}").
# NOTE(review): this takes the first id the tokenizer returns for each
# bracket character; with a SentencePiece tokenizer that may be a
# word-boundary piece rather than the bracket itself -- confirm with
# tokenizer.convert_ids_to_tokens().
SPECB_QUE_BOS = tokenizer.encode("[", add_special_tokens=False)[0]
SPECB_QUE_EOS = tokenizer.encode("]", add_special_tokens=False)[0]

SPECB_DOC_BOS = tokenizer.encode("{", add_special_tokens=False)[0]
SPECB_DOC_EOS = tokenizer.encode("}", add_special_tokens=False)[0]


def tokenize_with_specb(texts, is_query):
    """Tokenize ``texts`` and wrap every sequence in bracket tokens.

    Args:
        texts: List of strings to tokenize.
        is_query: If true, wrap with the query brackets; otherwise wrap
            with the document brackets.

    Returns:
        The padded batch produced by ``tokenizer.pad(..., return_tensors="pt")``.
    """
    # Tokenize without padding first so the bracket tokens can be added
    # to the raw Python lists.
    batch_tokens = tokenizer(texts, padding=False, truncation=True)

    for seq, att in zip(batch_tokens["input_ids"], batch_tokens["attention_mask"]):
        if is_query:
            seq.insert(0, SPECB_QUE_BOS)
            seq.append(SPECB_QUE_EOS)
        else:
            seq.insert(0, SPECB_DOC_BOS)
            seq.append(SPECB_DOC_EOS)
        # The two added tokens must be attended to as well.
        att.insert(0, 1)
        att.append(1)

    # Pad only after the brackets are in place so all rows share a length.
    batch_tokens = tokenizer.pad(batch_tokens, padding=True, return_tensors="pt")
    return batch_tokens


def get_weightedmean_embedding(batch_tokens, model):
    """Return one position-weighted mean embedding per input sequence.

    Each token's hidden state is weighted by its 1-based position, and
    padding positions are zeroed out through the attention mask.

    Args:
        batch_tokens: Padded batch containing at least ``attention_mask``;
            it is unpacked into ``model(...)``.
        model: Model whose output exposes ``last_hidden_state``.

    Returns:
        Tensor with one embedding row per input sequence.
    """
    with torch.no_grad():
        last_hidden_state = model(
            **batch_tokens, output_hidden_states=True, return_dict=True
        ).last_hidden_state

    # Position weights 1..seq_len, broadcast to the hidden-state shape.
    weights = (
        torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
        .unsqueeze(0)
        .unsqueeze(-1)
        .expand(last_hidden_state.size())
        .float()
        .to(last_hidden_state.device)
    )

    # Attention mask broadcast to the hidden-state shape (0 on padding).
    input_mask_expanded = (
        batch_tokens["attention_mask"]
        .unsqueeze(-1)
        .expand(last_hidden_state.size())
        .float()
    )

    # Weighted sum over the sequence axis, normalised by the total weight
    # of the non-padding positions.
    sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
    sum_mask = torch.sum(input_mask_expanded * weights, dim=1)

    embeddings = sum_embeddings / sum_mask

    return embeddings


query_embeddings = get_weightedmean_embedding(
    tokenize_with_specb(queries, is_query=True), model
)
doc_embeddings = get_weightedmean_embedding(
    tokenize_with_specb(target_docs, is_query=False), model
)

# scipy.spatial.distance.cosine only accepts two 1-D vectors; handing it a
# list of tensors / a 2-D tensor is what raised
# "ValueError: only one element tensors can be converted to Python scalars".
# torch's cosine_similarity broadcasts the single query row against every
# document row and returns one score per document. All document embeddings
# are used (no [1:] slice) so the scores line up with the rows of `docs`.
cosine_sim = F.cosine_similarity(query_embeddings[0:1], doc_embeddings, dim=1)

docs["類似度"] = cosine_sim.cpu().numpy()
print(docs.sort_values("類似度", ascending=False))
回答1件
あなたの回答
tips
プレビュー
バッドをするには、ログインかつ
こちらの条件を満たす必要があります。