編集履歴

質問編集履歴

2016/01/24 08:28

投稿

kamatmt

スコア25

test CHANGED Viewed

File without changes

test CHANGED Viewed

@@ -836,7 +836,7 @@
 for n in 1..fb do
-	si[n]="/usr/home/hakamata/ru/tweet_html/si#{n}.txt"
+	si[n]="/usr/home/ru/tweet_html/si#{n}.txt"
 end
@@ -848,7 +848,7 @@
 for n in 1..fa do
-	chan[n]="/usr/home/hakamata/ru/tweet_html/r#{n}.txt"
+	chan[n]="/usr/home/ru/tweet_html/r#{n}.txt"
 end

追加

2016/01/24 08:28

投稿

kamatmt

スコア25

test CHANGED Viewed

File without changes

test CHANGED Viewed

@@ -49,3 +49,871 @@
 考えられることとしてどういったことがありますか？
+追加.........................................................
+元のプログラムです。
+長く汚いプログラムで申し訳ないです。
+116:in `*': nil can't be coerced into Float (TypeError)
+このようなエラーがでています。
+最終的には類似度を出したいです。
+手詰まり状態で困っています。
+```ruby
+#encoding: utf-8
+require "MeCab"
+require "kconv"
+require "nkf"
+###########
+def hinsi(sentence)
+    mecab = MeCab::Tagger.new
+	node = mecab.parseToNode(sentence)
+	word = []
+	begin
+		node = node.next
+					data="#{node.surface.toutf8},#{node.feature.toutf8}"
+			if data.split(",")[1].toutf8 == "名詞" then
+					if data.split(",")[2].toutf8 =~ /一般|固有名詞/ then
+						word.push("#{node.surface.toutf8}")
+					end
+			end
+	 end until node.next.feature.include?("BOS/EOS")
+return word
+end
+###################
+#出現回数
+def cou(ta)
+hash = Hash.new
+q =0
+ta.each{|line|
+	array = line.split("\t")
+	tarm = array[0]
+	q += 1
+	if hash.include?(tarm) then
+		hash[tarm] += 1
+	else
+		hash[tarm] = 1
+	end
+#	print array[0], "\n"
+}
+a = hash.values
+b = hash.keys
+    return a,q.round(3),b
+end
+#####################
+# tf
+def tf_(situgen,meitan)
+c = situgen.length   #単語の種類の数
+tf = []
+for n in 0..c-1 do
+	tf[n] = situgen[n] / meitan
+	tf[n] = tf[n].round(3)
+end
+return tf
+end
+##################
+#df
+def df_(w,at)
+hash = Hash.new
+q =0
+for n in w do
+	for m in at do
+		pos=m.include?(n)
+		if pos == true then
+		  array = n.split("\t")
+      	  tarm = array[0]
+	      q += 1
+	      if hash.include?(tarm) then
+		     hash[tarm] += 1
+	      else
+		     hash[tarm] = 1
+	      end
+        end
+    end
+end
+a = hash.keys
+b = hash.values
+return a,b
+end
+####################
+#idf
+def idf_(df_t,alt)
+c=df_t.length
+d=[]
+idf=[]
+for i in 0..c-1 do
+  d[i]=df_t[i].round(2)
+  idf[i] = Math.log(alt.length/d[i])+1
+  idf[i]= idf[i].round(3)
+end
+return idf
+end
+###################
+#tf-idf
+def tfidf_(tf_t,idf_t)
+t = tf_t.length
+tfidf=[]
+for i in 0..t-1 do
+	tfidf[i]=tf_t[i] * idf_t[i]
+	tfidf[i]=tfidf[i].round(3)
+end
+return tfidf
+end
+############################
+#単語ごとのベクトル
+def coscount(b1,b2,co1,co2)
+al=[]
+al += b1
+al += b2
+a = al.uniq #種類の数
+x = b1.length
+y = b2.length
+z = a.length
+array1 =[]
+for i in 0..z-1 do
+	array1[i] = 0
+end
+array2 =[]
+for i in 0..z-1 do  #s：種類の数
+	array2[i] = 0
+end
+for n in  0..z-1 do
+	for m in 0..x-1 do
+		if a[n] == b1[m] then
+			array1[n]=co1[m]
+		end
+	end
+	for l in 0..y-1 do
+		if a[n] == b2[l] then
+			array2[n]=co2[l]
+		end
+	end
+end
+return a,array1,array2
+end
+####################################
+  #類似度
+  def cosine_similarity(vector1,vector2)
+    dp = dot_product(vector1, vector2)
+    nm = normalize(vector1) * normalize(vector2)
+    dp / nm
+  end
+  def dot_product(vector1, vector2)
+    sum = 0.0
+    vector1.each_with_index{ |val, i| sum += val*vector2[i] }
+    sum
+  end
+  def normalize(vector)
+    Math.sqrt(vector.inject(0.0){ |m,o| m += o**2 })
+  end
+#####################
+def main(t1,t2,k1,k2)
+#形態素解析a
+w1=[]
+for  j in 1..k1-1 do
+	if t1[j] != nil then
+	w1[j]=hinsi(t1[j])
+	end
+end
+#形態素解析b
+w2=[]
+for  j in 1..k2-1 do
+	if t2[j] != nil then
+	w2[j]=hinsi(t2[j])
+	end
+end
+####################
+#出現回数a
+c1 =[]
+ac1= []
+bc1=[]
+for j in 0..k1-1 do
+	if w1[j] != nil then
+		c1[j],ac1[j],bc1[j] = cou(w1[j])
+	end
+end
+#出現回数b
+c2 =[]
+ac2= []
+bc2=[]
+for j in 0..k2-1 do
+	if w2[j] != nil then
+	c2[j],ac2[j],bc2[j] = cou(w2[j])
+	end
+end
+########################
+#tf a
+tf1=[]
+for j in 0..k1-1 do
+	if c1[j] != nil then
+	tf1[j] = tf_(c1[j],ac1[j])
+	end
+end
+#tf b
+tf2=[]
+for j in 0..k2-1 do
+	if c2[j] != nil then
+	tf2[j] = tf_(c2[j],ac2[j])
+	end
+end
+###############################
+#df a
+key1=[]
+df1 =[]
+for j in 0..k1-1 do
+	if w1[j] != nil then
+	key1[j],df1[j] = df_(w1[j],t1) #key1:単語 df1:ある単語がでる文書の数
+    end
+end
+#df b
+key2=[]
+df2 =[]
+for j in 0..k2-1 do
+	if w2[j] != nil then
+	key2[j],df2[j] = df_(w2[j],t2)
+    end
+end
+#################################
+#idf a
+idf1=[]
+for j in 0..k1-1 do
+	if df1[j] != nil then
+	idf1[j] = idf_(df1[j],t1)
+	end
+end
+#idf b
+idf2=[]
+for j in 0..k2-1 do
+	if df2[j] != nil then
+	idf2[j] = idf_(df2[j],t2)
+	end
+end
+##########################
+#tfidf a
+tfidf1=[]
+for j in 0..k1-1 do
+	if tf1[j] != nil && idf1[j] != nil then
+	tfidf1[j] = tfidf_(tf1[j],idf1[j])
+	end
+end
+tfidf2=[]
+for j in 0..k2-1 do
+	if tf1[j] != nil && idf1[j] != nil then
+	tfidf2[j] = tfidf_(tf2[j],idf2[j])
+	end
+end
+#puts tfidf1
+#puts tfidf2,key2
+#######################
+#カウント
+l=0
+cosrui=[]
+a=[]
+ara1=[]
+ara2=[]
+for j in 0..k1-1 do
+	for i in 0..k2-1 do
+		a[l],ara1[l],ara2[l] =coscount(bc1[j],bc2[i],tfidf1[j],tfidf2[i])
+		l+=1
+	end
+end
+#類似度
+for j in 0..l-1 do
+	cosrui[j] = cosine_similarity(ara1[j],ara2[j])
+	#h.puts "類似度",cosrui[j]
+end
+#####################
+z = 0
+cos=[]
+if cosrui[0]>=cosrui[1] then
+	cos=cosrui[0]
+	z = 0
+else
+	cos=cosrui[1]
+	z = 0
+end
+for n in 2..j-1 do
+	if cos <= cosrui[n] then
+		cos=cosrui[n]
+		z = n
+	end
+end
+return cos
+end
+##########################################
+#ファイル入力
+def read_data_file(filename)
+  sentence=[]
+	f=File.open("#{filename}","r:UTF-8")
+	f.each{|data|
+		sentence.push(data)
+	}
+k= sentence.length
+t=[]
+for j in 0..k-1 do
+   t[j]=NKF.nkf("-Xw",sentence[j])
+end
+t.uniq!
+return t,k
+end
+#ファイル数
+puts "ファイルb"
+fb=gets.to_i #
+puts "ファイルa"
+fa=gets.to_i #
+si=[]
+for n in 1..fb do
+	si[n]="/usr/home/hakamata/ru/tweet_html/si#{n}.txt"
+end
+chan=[]
+for n in 1..fa do
+	chan[n]="/usr/home/hakamata/ru/tweet_html/r#{n}.txt"
+end
+##############
+a=[]
+b=[]
+i=1
+data=[]
+cosr=[]
+for n in 1..fb do
+	for m in 1..fa do
+		if si[n] != nil && chan[m] != nil then
+	d1,ke1 = read_data_file(si[n])
+	d2,ke2 = read_data_file(chan[m])
+	cosr[i]=main(d1,d2,ke1,ke2)
+	a[i]=n
+	b[i]=m
+	i+=1
+	end
+	end
+end
+	h=File.open("w.txt","w:UTF-8")
+for n in 1..i-1 do
+	h.puts "類似度",cosr[n]
+	h.puts a[n],b[n],"\n"
+end
+```

2016/01/24 08:27

投稿

kamatmt

スコア25

test CHANGED Viewed

File without changes

test CHANGED Viewed

@@ -6,23 +6,31 @@
 計算の途中経過として
 ①
-0.045
-0.045
+0.045,
+0.045,
-0.111
+0.111,
 ...
 Ⅰ
-5.575
-3.01
+5.575,
+3.01,
-3.625
+3.625,
 ...