java tf-idf値を計算するプログラム

if-idf値を計算し、ファイルに書き出すプログラムを書いているのですが、ファイルに出力される値がおかしくなってしまいます。

5つの入力ファイルから単語を読み取り、単語の出現回数や特定の単語が5つのファイルの内いくつのファイルに存在するかなどを数えて、tf-idf値の上位5つの値をファイルに書き出したいです。

何度もプログラムを見返してみたのですが、どこが悪いのかわかりません、、
wordFreqMapは単語とその出現回数、sourceWordCountMapはソースファイルとWordCountのインスタンス、docFreqMapは単語とその単語が含まれるファイルの数保管するものです。

プログラミングが得意ではなくて、質問の仕方もわかりにくいとは思いますが、よろしくお願いいたします、、、

java
1package p7;
2
3import java.io.*;
4import java.lang.*;
5import java.util.*;
6
7import java.util.Map.Entry;
8
9//for 7th week
10public class WordCount {
11
12	private File sourceFile;
13	private int total;
14	private HashMap<String, Integer> wordFreqMap;
15	// for 7th week
16	private HashMap<String, Double> tfidfWeight;
17	private static HashMap<String, WordCount> sourceWordCountMap = new HashMap<>();
18	private static HashMap<String, Integer> docFreqMap = new HashMap<>();
19
20	// for 7th week: Exercise 7-1
21	// Initialize sourceFile, total, wordFreqMap, and tfidfWeight fields.
22	WordCount(File sourceFile) {
23		setSourceFile(sourceFile);
24		total=0;
25		wordFreqMap=new HashMap<>();
26		tfidfWeight=new HashMap<>();
27	}
28
29	// for 7th week: Exercise 7-1
30	String getSourceName() {
31		return sourceFile.getName();
32	}
33
34	// for 7th week: Exercise 7-1
35	void setSourceFile(File sourceFile) {
36		this.sourceFile=sourceFile;
37	}
38
39	// for 6th week
40	// Copy the code from TextInfo.java
41	private void countFreq(String word1) {
42		String[] words = word1.split(" ");
43        for (String word : words) {
44            if (!word.isEmpty()) {
45                if (wordFreqMap.containsKey(word)) {
46                    int count = wordFreqMap.get(word) + 1;
47                    wordFreqMap.put(word, count);
48                } else {
49                	wordFreqMap.put(word, 1);
50                }
51            }
52        }
53        List<Entry<String,Integer>> entries=new ArrayList<>(wordFreqMap.entrySet());
54		Collections.sort(entries,new Comparator<>() {
55			public int compare(Entry<String,Integer> obj1,Entry<String,Integer> obj2) {
56				return obj2.getValue().compareTo(obj1.getValue());
57			}
58		});
59	}
60
61	// for 7th week: Exercise 7-1
62	void readFile() {
63		try {
64			InputStreamReader isr = new InputStreamReader(new FileInputStream(sourceFile), "Shift-JIS");
65			BufferedReader br = new BufferedReader(isr);
66			String s, token[];
67			while ((s = br.readLine()) != null) {
68				if (s.isEmpty()) {
69					continue;
70				}
71				countFreq(s);
72			}
73			br.close();
74		} catch (Exception e) {
75			e.printStackTrace();
76		}
77	}
78
79	// for 6th week
80	// Copy the code from TextInfo.java
81	int getFreq(String word) {
82		return wordFreqMap.get(word);
83	}
84
85	// for 7th week: Exercise 7-2
86	// Returns the set of words stored in its instance.
87	Set<String> getWordSet() {
88		return this.wordFreqMap.keySet();
89	}
90
91	// for 7th week: Exercise 7-1
92	// Writes the top n words in frequency (together with their frequency).
93	void writeTopNFrequentWords(int n, File outputFile) {
94		try {
95			OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(outputFile), "Shift-JIS");
96			BufferedWriter bw = new BufferedWriter(osw);
97			PrintWriter pw = new PrintWriter(bw);
98			List<Entry<String,Integer>> entries=new ArrayList<>(wordFreqMap.entrySet());
99			Collections.sort(entries,new Comparator<>() {
100				public int compare(Entry<String,Integer> obj1,Entry<String,Integer> obj2) {
101					return obj2.getValue().compareTo(obj1.getValue());
102				}
103			});
104			pw.println("Top "+n+" words in frequency");
105			int i=0;
106			for(Entry<String,Integer> entry:entries) {
107				if(i==n) {
108					break;
109				}
110				pw.println((i+1)+":"+entry.getKey()+"("+entry.getValue()+")");
111				i++;
112			}
113			pw.println();
114			pw.close();
115		} catch (IOException e) {
116			e.printStackTrace();
117		}
118	}
119
120	// for 7th week: Exercise 7-1
121	// Set sourceWordCountMap that stores the pairs of a sourceName (key) and
122	// the instance of WordCount (value).
123	static void setSourceWordCountMap(String inputDirPath) {
124		
125		File dir=new File(inputDirPath);
126		File [] files=dir.listFiles();
127		
128		for(File file : files) {
129			sourceWordCountMap.put(file.getName(),new WordCount(new File(inputDirPath,file.getName())));
130			sourceWordCountMap.get(file.getName()).readFile();
131		}
132	}
133
134	// for 7th week: Exercise 7-2
135	// Calculates the TF-IDF weight of each word stored in its instance.
136	// See the slide p.15 for the definition of TF-IDF.
137	void calTFIDF() {
138		
139		
140
141		for(WordCount wc : sourceWordCountMap.values() ) {
142			
143			Set<String> words=wc.getWordSet();
144			
145			double nd=0;
146			for(String word : words) {
147				nd+=wc.wordFreqMap.get(word);
148			}
149			//System.out.println(nd);
150			double docNum=sourceWordCountMap.size();
151			//System.out.println(docNum);
152			
153			
154			
155			
156			
157			for(String word : words) {
158				double nwd=wc.wordFreqMap.get(word);
159				//System.out.println(word);
160				//System.out.println(nwd);
161				double tf=nwd/nd;
162				//System.out.println(tf);
163				double dw=docFreqMap.get(word);
164				//System.out.println(dw);
165				
166				double idf=Math.log(docNum/dw);
167				
168				double ans=tf*idf;
169				
170				tfidfWeight.put(word,((double)Math.round(ans*1000))/1000);
171				
172				
173				
174				
175				
176				
177			}
178			
179			
180		}
181		
182		
183	}
184
185	// for 7th week: Exercise 7-2
186	// Writes the top n words in TF-IDF weight (together with their TF-IDF weight).
187	void writeTopNTfIdfWords(int n, File outputFile) {
188		try {
189			OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(outputFile), "Shift-JIS");
190			BufferedWriter bw = new BufferedWriter(osw);
191			PrintWriter pw = new PrintWriter(bw);
192			
193			List<Entry<String,Double>> entries=new ArrayList<>(tfidfWeight.entrySet());
194			Collections.sort(entries,new Comparator<>() {
195				public int compare(Entry<String,Double> obj1,Entry<String,Double> obj2) {
196					return obj2.getValue().compareTo(obj1.getValue());
197				}
198			});
199			pw.println("Top "+n+" words in TF-IDF");
200			int i=0;
201			for(Entry<String,Double> entry:entries) {
202				if(i==n) {
203					break;
204				}
205				pw.println((i+1)+":"+entry.getKey()+"("+entry.getValue()+")");
206				i++;
207			}
208			pw.println();
209			
210			
211			
212			pw.close();
213		} catch (IOException e) {
214			e.printStackTrace();
215		}
216	}
217
218	// for 7th week: Exercise 7-2
219	// Set docFreqMap that stores the pairs of a word (key) and the number of
220	// documents in which the word appears (value).
221	static void setDocFreqMap() {
222		
223		int count=0;
224		
225		for(WordCount wc : sourceWordCountMap.values() ) {
226			
227			Set<String> words=wc.getWordSet();
228			for(String word : words) {
229				if(!word.isEmpty()) {
230					if(docFreqMap.containsKey(word)) {
231						count=docFreqMap.get(word)+1;
232						docFreqMap.put(word,count);
233					}else {
234						docFreqMap.put(word,1);
235					}
236				}
237			}
238			
239			
240		}
241			
242		
243			
244	}
245
246	public static void main(String[] args) {
247		if (args.length != 3) {
248			System.err.println(
249					"java WordCount [test1|test2|test3|test4|test5|freq|tfidf] [input dir path] [source name]");
250			System.exit(-1);
251		}
252		String type = args[0];
253		String inputDirPath = args[1];
254		String sourceName = args[2];
255
256		setSourceWordCountMap(inputDirPath);
257		setDocFreqMap();
258		WordCount wc = sourceWordCountMap.get(sourceName);
259		File outputFile = new File(type + "_" + sourceName);
260		wc.calTFIDF();
261		switch (type) {
262		
263		case "tfidf": // for Exercise 7-2
264			wc.writeTopNTfIdfWords(5, outputFile);
265			break;
266		default:
267			System.err.println(
268					"java WordCount [test1|test2|test3|test4|test5|freq|tfidf] [input dir path] [source name]");
269		}
270	}
271
272}
273

m.ts10806

2021/11/13 06:50

おかしいとは、どうなる予定がどうなっているのでしょう。 https://teratail.com/help/question-tips#questionTips3-4

jimbe

2021/11/13 12:30

あちこちに「//for 7th week」等ありますが、何かの教材からのコードでしょうか。

hanao_

2021/11/13 18:30

Top 5 words in TF-IDF weight 1: Hyundai(0.027) 2: Hyundai's(0.011) 3: dealers(0.006) 4: market(0.006) 5: auto(0.005) ↑のような結果が出てほしいのですが、 Top 5 words in TF-IDF 1: Tesla(0.041) 2: Hyundai(0.027) 3: Model(0.027) 4: company(0.024) 5: sanctions(0.021) 自分のプログラムでは↑のような出力になってしまいます、、、 HyundaiのTD-IDF値は正しく出ているのでTF-IDF値の計算自体は間違っていないと思うのですが、、、

hanao_

2021/11/13 18:34

教材からのコードです

jimbe

2021/11/13 19:02

その教材はどうしたら見れるでしょうか。また、ご提示の結果になるデータはどこから得られるでしょうか。