if-idf値を計算し、ファイルに書き出すプログラムを書いているのですが、ファイルに出力される値がおかしくなってしまいます。
5つの入力ファイルから単語を読み取り、単語の出現回数や特定の単語が5つのファイルの内いくつのファイルに存在するかなどを数えて、tf-idf値の上位5つの値をファイルに書き出したいです。
何度もプログラムを見返してみたのですが、どこが悪いのかわかりません、、
wordFreqMapは単語とその出現回数、sourceWordCountMapはソースファイルとWordCountのインスタンス、docFreqMapは単語とその単語が含まれるファイルの数保管するものです。
プログラミングが得意ではなくて、質問の仕方もわかりにくいとは思いますが、よろしくお願いいたします、、、
java
1package p7; 2 3import java.io.*; 4import java.lang.*; 5import java.util.*; 6 7import java.util.Map.Entry; 8 9//for 7th week 10public class WordCount { 11 12 private File sourceFile; 13 private int total; 14 private HashMap<String, Integer> wordFreqMap; 15 // for 7th week 16 private HashMap<String, Double> tfidfWeight; 17 private static HashMap<String, WordCount> sourceWordCountMap = new HashMap<>(); 18 private static HashMap<String, Integer> docFreqMap = new HashMap<>(); 19 20 // for 7th week: Exercise 7-1 21 // Initialize sourceFile, total, wordFreqMap, and tfidfWeight fields. 22 WordCount(File sourceFile) { 23 setSourceFile(sourceFile); 24 total=0; 25 wordFreqMap=new HashMap<>(); 26 tfidfWeight=new HashMap<>(); 27 } 28 29 // for 7th week: Exercise 7-1 30 String getSourceName() { 31 return sourceFile.getName(); 32 } 33 34 // for 7th week: Exercise 7-1 35 void setSourceFile(File sourceFile) { 36 this.sourceFile=sourceFile; 37 } 38 39 // for 6th week 40 // Copy the code from TextInfo.java 41 private void countFreq(String word1) { 42 String[] words = word1.split(" "); 43 for (String word : words) { 44 if (!word.isEmpty()) { 45 if (wordFreqMap.containsKey(word)) { 46 int count = wordFreqMap.get(word) + 1; 47 wordFreqMap.put(word, count); 48 } else { 49 wordFreqMap.put(word, 1); 50 } 51 } 52 } 53 List<Entry<String,Integer>> entries=new ArrayList<>(wordFreqMap.entrySet()); 54 Collections.sort(entries,new Comparator<>() { 55 public int compare(Entry<String,Integer> obj1,Entry<String,Integer> obj2) { 56 return obj2.getValue().compareTo(obj1.getValue()); 57 } 58 }); 59 } 60 61 // for 7th week: Exercise 7-1 62 void readFile() { 63 try { 64 InputStreamReader isr = new InputStreamReader(new FileInputStream(sourceFile), "Shift-JIS"); 65 BufferedReader br = new BufferedReader(isr); 66 String s, token[]; 67 while ((s = br.readLine()) != null) { 68 if (s.isEmpty()) { 69 continue; 70 } 71 countFreq(s); 72 } 73 br.close(); 74 } catch (Exception e) { 75 e.printStackTrace(); 76 } 77 } 78 79 // for 6th week 80 // Copy the code from TextInfo.java 81 int getFreq(String word) { 82 return wordFreqMap.get(word); 83 } 84 85 // for 7th week: Exercise 7-2 86 // Returns the set of words stored in its instance. 87 Set<String> getWordSet() { 88 return this.wordFreqMap.keySet(); 89 } 90 91 // for 7th week: Exercise 7-1 92 // Writes the top n words in frequency (together with their frequency). 93 void writeTopNFrequentWords(int n, File outputFile) { 94 try { 95 OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(outputFile), "Shift-JIS"); 96 BufferedWriter bw = new BufferedWriter(osw); 97 PrintWriter pw = new PrintWriter(bw); 98 List<Entry<String,Integer>> entries=new ArrayList<>(wordFreqMap.entrySet()); 99 Collections.sort(entries,new Comparator<>() { 100 public int compare(Entry<String,Integer> obj1,Entry<String,Integer> obj2) { 101 return obj2.getValue().compareTo(obj1.getValue()); 102 } 103 }); 104 pw.println("Top "+n+" words in frequency"); 105 int i=0; 106 for(Entry<String,Integer> entry:entries) { 107 if(i==n) { 108 break; 109 } 110 pw.println((i+1)+":"+entry.getKey()+"("+entry.getValue()+")"); 111 i++; 112 } 113 pw.println(); 114 pw.close(); 115 } catch (IOException e) { 116 e.printStackTrace(); 117 } 118 } 119 120 // for 7th week: Exercise 7-1 121 // Set sourceWordCountMap that stores the pairs of a sourceName (key) and 122 // the instance of WordCount (value). 123 static void setSourceWordCountMap(String inputDirPath) { 124 125 File dir=new File(inputDirPath); 126 File [] files=dir.listFiles(); 127 128 for(File file : files) { 129 sourceWordCountMap.put(file.getName(),new WordCount(new File(inputDirPath,file.getName()))); 130 sourceWordCountMap.get(file.getName()).readFile(); 131 } 132 } 133 134 // for 7th week: Exercise 7-2 135 // Calculates the TF-IDF weight of each word stored in its instance. 136 // See the slide p.15 for the definition of TF-IDF. 137 void calTFIDF() { 138 139 140 141 for(WordCount wc : sourceWordCountMap.values() ) { 142 143 Set<String> words=wc.getWordSet(); 144 145 double nd=0; 146 for(String word : words) { 147 nd+=wc.wordFreqMap.get(word); 148 } 149 //System.out.println(nd); 150 double docNum=sourceWordCountMap.size(); 151 //System.out.println(docNum); 152 153 154 155 156 157 for(String word : words) { 158 double nwd=wc.wordFreqMap.get(word); 159 //System.out.println(word); 160 //System.out.println(nwd); 161 double tf=nwd/nd; 162 //System.out.println(tf); 163 double dw=docFreqMap.get(word); 164 //System.out.println(dw); 165 166 double idf=Math.log(docNum/dw); 167 168 double ans=tf*idf; 169 170 tfidfWeight.put(word,((double)Math.round(ans*1000))/1000); 171 172 173 174 175 176 177 } 178 179 180 } 181 182 183 } 184 185 // for 7th week: Exercise 7-2 186 // Writes the top n words in TF-IDF weight (together with their TF-IDF weight). 187 void writeTopNTfIdfWords(int n, File outputFile) { 188 try { 189 OutputStreamWriter osw = new OutputStreamWriter(new FileOutputStream(outputFile), "Shift-JIS"); 190 BufferedWriter bw = new BufferedWriter(osw); 191 PrintWriter pw = new PrintWriter(bw); 192 193 List<Entry<String,Double>> entries=new ArrayList<>(tfidfWeight.entrySet()); 194 Collections.sort(entries,new Comparator<>() { 195 public int compare(Entry<String,Double> obj1,Entry<String,Double> obj2) { 196 return obj2.getValue().compareTo(obj1.getValue()); 197 } 198 }); 199 pw.println("Top "+n+" words in TF-IDF"); 200 int i=0; 201 for(Entry<String,Double> entry:entries) { 202 if(i==n) { 203 break; 204 } 205 pw.println((i+1)+":"+entry.getKey()+"("+entry.getValue()+")"); 206 i++; 207 } 208 pw.println(); 209 210 211 212 pw.close(); 213 } catch (IOException e) { 214 e.printStackTrace(); 215 } 216 } 217 218 // for 7th week: Exercise 7-2 219 // Set docFreqMap that stores the pairs of a word (key) and the number of 220 // documents in which the word appears (value). 221 static void setDocFreqMap() { 222 223 int count=0; 224 225 for(WordCount wc : sourceWordCountMap.values() ) { 226 227 Set<String> words=wc.getWordSet(); 228 for(String word : words) { 229 if(!word.isEmpty()) { 230 if(docFreqMap.containsKey(word)) { 231 count=docFreqMap.get(word)+1; 232 docFreqMap.put(word,count); 233 }else { 234 docFreqMap.put(word,1); 235 } 236 } 237 } 238 239 240 } 241 242 243 244 } 245 246 public static void main(String[] args) { 247 if (args.length != 3) { 248 System.err.println( 249 "java WordCount [test1|test2|test3|test4|test5|freq|tfidf] [input dir path] [source name]"); 250 System.exit(-1); 251 } 252 String type = args[0]; 253 String inputDirPath = args[1]; 254 String sourceName = args[2]; 255 256 setSourceWordCountMap(inputDirPath); 257 setDocFreqMap(); 258 WordCount wc = sourceWordCountMap.get(sourceName); 259 File outputFile = new File(type + "_" + sourceName); 260 wc.calTFIDF(); 261 switch (type) { 262 263 case "tfidf": // for Exercise 7-2 264 wc.writeTopNTfIdfWords(5, outputFile); 265 break; 266 default: 267 System.err.println( 268 "java WordCount [test1|test2|test3|test4|test5|freq|tfidf] [input dir path] [source name]"); 269 } 270 } 271 272} 273
あなたの回答
tips
プレビュー