hashを用いた高速化を実現する

前提・実現したいこと

Wikipediaの題名とカテゴリを用いてカテゴリから題名を出力するプログラムを作成したいです、
現在hashを使わずにソースを作成したのですが実行が遅いのでhashを使って高速にしたいです。

発生している問題・

hashで1つの題名に１つのカテゴリなら作成できると思うのですが
複数のカテゴリを付与する場合どのようにすればいいのでしょうか？

該当のソースコード・現状

ソースコード
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <string.h>

using namespace std;

// 機能：引数として渡されたカテゴリが割り当てられているページのタイトルを検索する
// 引数：カテゴリ名
// 戻値：検索結果　タイトル１,タイトル2,...のような , で区切られた文字列

string search_page(string query)
{
    ifstream f_data("data.txt"); 

    string result = "";
    string line;

    // data.txtから１行ずつ読み込み
    while(f_data >> line)
    {
        // 記事タイトルとカテゴリ（category1,category2,...）のデータに分割
        int index = line.find(","); 
        string title = line.substr(0,index); 
        string c_str = line.substr(index+1); 
	
	// カテゴリ列のデータからカテゴリを１つずつ取り出し（split）
	int n;
	for(int i=0; i <= c_str.length(); i=n+1 )
	{
	    n = c_str.find_first_of(",", i);
	    if( n == string::npos ) 
	    {
		n = c_str.length();
	    }

	    string cw = c_str.substr(i, n-i );

	    if(query == cw)
	    {
	        result += title+",";
	    }
	}
   }
   f_data.close();

   return result;
}

int main(int argc, char *argv[])
{     
    ifstream fc_data("category.list"); 

    string result;
    string line;

    // data.txtから１行ずつ読み込み
    while(fc_data >> line)
    {
        cout << line << " -> ";
	
        string result = search_page(line);

	if(result != "")
	{
	   cout << result << endl;
        }
	
	cout << endl;
    }
    fc_data.close();
   
   return 0;
}

###改善版/ぐちゃぐちゃです

#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <string.h>
#include<cmath>
#define HashSize 1000;
using namespace std;

///////////ここでカテゴリの複数化を図りたい
struct cell
{
   string name;   // 記事
   string categori;    // カテゴリ

   struct cell *next;	// 次のデータへのポインタ
};
// ハッシュ表 
struct hashhash
{
  int    size;           // ハッシュ表の大きさ 
  struct cell **table; 
};
// ハッシュ関数（keyのハッシュ値を返す）
// 文字列の1文字ごとの文字コードを加算するハッシュ関数
int hash1(string key, int size) 
{
  unsigned int v=0;

  for(int i=0;i<key.length();i++) 
    {
      v+=key[i]*pow(2,i);
    }

  return v%size;
}
/*--- ハッシュ表の初期化 ---*/
int initialize(struct hashhash *h, int size)
{
  h->table=(cell**)new struct cell[size];
  h->size=size;

  for(int i=0;i<size;i++)
    {
      h->table[i]=NULL;
    }
  return 1;
}
// データの追加 
int add(struct hashhash *h, string name, string categori)
{
  //  追加するデータのハッシュ値 
  int hash_value=hash1(name, h->size);	

  // 接続するデータを格納する要素を生成
  struct cell *temp;
  temp=new struct cell;
  temp->name=name;
  temp->categori=categori;

  // 要素をハッシュに追加（上書きで追加）
  temp->next=h->table[hash_value]; 
  h->table[hash_value]=temp;
  return 0; 
}

struct cell *search_tel(struct hashhash *head, string name)
{
  int hash_value=hash1(name,head->size);
  struct cell *buf=head->table[hash_value];
  while(buf!=NULL)
    {
      if(buf->name == name)
	{
	  return buf;
	}
      buf=buf->next;
    }
  return NULL;
}


/*string search_page(string query)
{
    ifstream f_data("data.txt"); 

    string result = "";
    string line;

    // data.txtから１行ずつ読み込み
    while(f_data >> line)
    {
        // 記事タイトルとカテゴリ（category1,category2,...）のデータに分割
        int index = line.find(","); 
        string title = line.substr(0,index); 
        string c_str = line.substr(index+1); 
	
	// カテゴリ列のデータからカテゴリを１つずつ取り出し（split）
	int n;
	for(int i=0; i <= c_str.length(); i=n+1 )
	{
	    n = c_str.find_first_of(",", i);
	    if( n == string::npos ) 
	    {
		n = c_str.length();
	    }

	    string cw = c_str.substr(i, n-i );

	    if(query == cw)
	    {
	        result += title+",";
	    }
	}
   }
   f_data.close();

   return result;
   }*/

int main(int argc, char *argv[])
{     
    ifstream fc_data("category.list"); 

    string result;
    string line;

    // data.txtから１行ずつ読み込み
    while(fc_data >> line)
    {
        cout << line << " -> ";
	
        string result = search_page(line);

	if(result != "")
	{
	   cout << result << endl;
        }
	
	cout << endl;
    }
    fc_data.close();
   
   return 0;
}

試したこと

未だ初心者でhashを使うとよくわからなくなってしまいます

行動規範の内容に同意します

回答3件

ベストアンサー

カテゴリを複数にするなら、もうテキストファイルをやめてsqlite等でデータベース化してしまった方が良いかもしれません。

記事テーブル（記事ID,記事タイトル）
カテゴリテーブル（記事ID、カテゴリ）

のように分けてテーブルを作成しておけば、複数のカテゴリも対応出来ますし、
タイトル、カテゴリのインデックスを作成しておけば、高速に検索出来ます。

投稿2020/07/27 04:46

退会済みユーザー

総合スコア0

Amateur0845

2020/07/27 19:15

確かにデータベース化することでメリットが多いですね。もう少しそちらの方向性で試行錯誤してみます。ありがとうございます。

行動規範の内容に同意します

正直hash化してもそんなに速くならないんじゃないですかね。どちらかというとメモリー確保を減らすほうがいいと思います。

cpp
1        string title = line.substr(0,index); 
2        string c_str = line.substr(index+1); 
3
4string cw = c_str.substr(i, n-i );

これらは全部新規にメモリーを確保します。std::string_viewを使うようにするとメモリー確保がいらなくなります。

cpp
1    string raw_line;
2
3    // data.txtから１行ずつ読み込み
4    while(f_data >> raw_line)
5    {
6        std::string_view line = raw_line;
7        // 記事タイトルとカテゴリ（category1,category2,...）のデータに分割
8        int index = line.find(","); 
9        auto title = line.substr(0,index); 
10        auto c_str = line.substr(index+1); 
11
12        // カテゴリ列のデータからカテゴリを１つずつ取り出し（split）
13        int n;
14        for(int i=0; i <= c_str.length(); i=n+1 )
15        {
16            n = c_str.find_first_of(",", i);
17            if( n == string::npos ) 
18            {
19            n = c_str.length();
20            }
21    
22            auto cw = c_str.substr(i, n-i );
23
24            if(query == cw)
25            {
26                result += title+",";
27            }
28        }
29   }

あとはハッシュ化はstd::hashを使えばいいと思います。

投稿2020/07/27 03:34

yumetodo

総合スコア5850

[ご参考] 要素の重複を許すハッシュ表(unordered_multimap)を使った例

C++
1#include <unordered_map>
2#include <string>
3#include <set>
4#include <algorithm>
5#include <iterator>
6#include <iostream>
7
8int main() {
9  using dict_type = std::unordered_multimap<std::string,std::string>;
10  using key_value = dict_type::value_type;
11  dict_type dict; // 辞書
12  // (カテゴリ,題名) を辞書に登録
13  dict.insert(key_value("赤","りんご"));
14  dict.insert(key_value("果物", "りんご"));
15  dict.insert(key_value("赤", "さくらんぼ"));
16  dict.insert(key_value("果物", "さくらんぼ"));
17  dict.insert(key_value("赤", "信号機"));
18  dict.insert(key_value("黄", "信号機"));
19  dict.insert(key_value("緑", "信号機"));
20  std::cout << "辞書 : ";
21  for ( auto item : dict ) {
22    std::cout << '(' << item.first << ',' << item.second << ") ";
23  }
24  std::cout << std::endl;
25
26  std::set<std::string> result1;
27  auto matches = dict.equal_range("赤");
28  while ( matches.first != matches.second ) result1.insert( matches.first++->second);
29
30  std::set<std::string> result2;
31  matches = dict.equal_range("果物");
32  while (matches.first != matches.second) result2.insert(matches.first++->second);
33
34  std::cout << "赤 または 果物 : ";
35  std::set_union(result1.begin(), result1.end(), result2.begin(), result2.end(),
36                 std::ostream_iterator<std::string>(std::cout, " "));
37  std::cout << std::endl;
38
39  std::cout << "赤 かつ　 果物 : ";
40  std::set_intersection(result1.begin(), result1.end(), result2.begin(), result2.end(),
41      std::ostream_iterator<std::string>(std::cout, " "));
42  std::cout << std::endl;
43}

実行結果

辞書 : (赤,りんご) (赤,さくらんぼ) (赤,信号機) (黄,信号機) (果物,りんご) (果物,さくらんぼ) (緑,信号機)
赤 または 果物 : さくらんぼ りんご 信号機
赤 かつ　 果物 : さくらんぼ りんご

投稿2020/07/27 04:26

episteme

総合スコア16614

あなたの回答

tips

プレビュー

行動規範の内容に同意します

質問の解決につながる回答をしましょう。サンプルコードなど、より具体的な説明があると質問者の理解の助けになります。また、読む側のことを考えた、分かりやすい文章を心がけましょう。

15分調べてもわからないことは
teratailで質問しよう！

ただいまの回答率
85.47%

質問をまとめることで
思考を整理して素早く解決

テンプレート機能で
簡単に質問をまとめる

質問する

質問をすることでしか得られない、回答やアドバイスがある。

15分調べてもわからないことは、質問しよう！