csvファイルのテキストの特殊記号の除去

CSVファイルのテキストデータの特殊記号除去

テキストデータの df['タイトル']、df['説明文']、df['タグ'] について、
それぞれ下のコードにある特殊記号を削除したいです。
また、処理後の内容を確認できるように、結果を df['タイトル2']、df['説明文2']、df['タグ2']
として出力する方法を教えてください。

python
1import pandas as pd
2df = pd.read_csv("input.csv", encoding="utf_8")
3
4puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
5 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\n', '\xa0', '\t',
6 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\u3000', '\u202f',
7 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
8 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

追記

頂いた回答をもとに、次のように実行したところこのようなエラーが出ました。
position17というのは、punctsの17個目にエラーの原因があるということでしょうか？

python
1puncts= r',|.|"|:|)|(|-|!|?|;|""|$|&|/|[|]|\>|%|=|#|*|+|\|•|~|@|£|·|_|{|}|©|^|®|`|<|→|°|€|™|›|♥|←|×|§|″|′|Â|█|½|à|…|\n|\xa0|\t|“|★|”|–|●|â|►|−|¢|²|¬|░|¶|↑|±|¿|▾|═|¦|║|―|¥|▓|—|‹|─|\u3000|\u202f|▒|：|¼|⊕|▼|▪|†|■|’|▀|¨|▄|♫|☆|é|¯|♦|¤|▲|è|¸|¾|Ã|⋅|‘|∞|«|∙|）|↓|、|│|（|»|||♪|╩|╚|³|・|╦|╣|╔|╗|▬|❤|ï|Ø|¹|≤|‡|√'
2
3def puncts_rm(s):
4    return re.sub(puncts, '', s)
5
6(省略)
7
8df["title2"] = df["title2"].apply(puncts_rm)
9print(df["title2"])

(省略)

10 df["title2"] = df["title2"].apply(puncts_rm)
     11 print(df["title2"])

~\anaconda3\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
   3589             else:
   3590                 values = self.astype(object).values
-> 3591                 mapped = lib.map_infer(values, f, convert=convert_dtype)
   3592 
   3593         if len(mapped) and isinstance(mapped[0], Series):

pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()

<ipython-input-84-2b440d6a84a2> in puncts_rm(s)
      1 def puncts_rm(s):
----> 2     return re.sub(puncts, '', s)
      3 
      4 def html_tags_rm(s):
      5     return re.sub(html_tags, '', s)

~\anaconda3\lib\re.py in sub(pattern, repl, string, count, flags)
    190     a callable, it's passed the Match object and must return
    191     a replacement string to be used."""
--> 192     return _compile(pattern, flags).sub(repl, string, count)
    193 
    194 def subn(pattern, repl, string, count=0, flags=0):

~\anaconda3\lib\re.py in _compile(pattern, flags)
    284     if not sre_compile.isstring(pattern):
    285         raise TypeError("first argument must be string or compiled pattern")
--> 286     p = sre_compile.compile(pattern, flags)
    287     if not (flags & DEBUG):
    288         if len(_cache) >= _MAXCACHE:

~\anaconda3\lib\sre_compile.py in compile(p, flags)
    762     if isstring(p):
    763         pattern = p
--> 764         p = sre_parse.parse(p, flags)
    765     else:
    766         pattern = None

~\anaconda3\lib\sre_parse.py in parse(str, flags, pattern)
    922 
    923     try:
--> 924         p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, 0)
    925     except Verbose:
    926         # the VERBOSE flag was switched on inside the pattern.  to be

~\anaconda3\lib\sre_parse.py in _parse_sub(source, state, verbose, nested)
    418     while True:
    419         itemsappend(_parse(source, state, verbose, nested + 1,
--> 420                            not nested and not items))
    421         if not sourcematch("|"):
    422             break

~\anaconda3\lib\sre_parse.py in _parse(source, state, verbose, nested, first)
    808             sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
    809                            not (del_flags & SRE_FLAG_VERBOSE))
--> 810             p = _parse_sub(source, state, sub_verbose, nested + 1)
    811             if not source.match(")"):
    812                 raise source.error("missing ), unterminated subpattern",

~\anaconda3\lib\sre_parse.py in _parse_sub(source, state, verbose, nested)
    418     while True:
    419         itemsappend(_parse(source, state, verbose, nested + 1,
--> 420                            not nested and not items))
    421         if not sourcematch("|"):
    422             break

~\anaconda3\lib\sre_parse.py in _parse(source, state, verbose, nested, first)
    643             if not item or item[0][0] is AT:
    644                 raise source.error("nothing to repeat",
--> 645                                    source.tell() - here + len(this))
    646             if item[0][0] in _REPEATCODES:
    647                 raise source.error("multiple repeat",

error: nothing to repeat at position 17

行動規範の内容に同意します

回答2件

質問者は正規表現の知識が不足していて、自分でメタ文字となる記号をエスケープするのは大変そうなので、re.escapeを使ってコード側で自動的にエスケープし、|で接続してパターンを生成する。

Python
1import pandas as pd
2import re
3import io
4
5# CSVファイルの代わりに使う文字列
6txt = """
7タイトル,説明文,タグ
8:a::a:a::,AAA<BBB>CCC,"aaa,bbb,ccc"
9'bbb(xxx)',BBB=CCC,"bbb,ccc"
10ccc:,"CCC
11DDD
12EEE","ccc,""ddd"",eee"
13"""
14
15puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£',
16          '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\n', '\xa0', '\t',
17          '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\u3000', '\u202f',
18          '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
19          '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]
20
21pd.set_option('display.unicode.east_asian_width', True) # 日本語混じりのdfの出力を揃える
22
23df = pd.read_csv(io.StringIO(txt)) # ファイルから読む場合はこちらの先頭に「#」を付ける
24# df = pd.read_csv("input.csv", encoding="utf_8") # ファイルから読む場合は先頭の「#」を取る
25
26# 除去用の正規表現パターンを生成
27puncts_escaped = [re.escape(p) for p in puncts]
28pat = "|".join(puncts_escaped)
29
30# 正規表現パターンを利用して除去を行ない、新たな列を生成
31df['タイトル2'] = df['タイトル'].apply(lambda x: re.sub(pat, '', x))
32df['説明文2'] = df['説明文'].apply(lambda x: re.sub(pat, '', x))
33df['タグ2'] = df['タグ'].apply(lambda x: re.sub(pat, '', x))
34
35# 結果を画面に表示
36print(df)

result
1     タイトル         説明文           タグ タイトル2    説明文2      タグ2
20   :a::a:a::    AAA<BBB>CCC    aaa,bbb,ccc       aaa  AAABBBCCC  aaabbbccc
31  'bbb(xxx)'        BBB=CCC        bbb,ccc    bbbxxx     BBBCCC     bbbccc
42        ccc:  CCC\nDDD\nEEE  ccc,"ddd",eee       ccc  CCCDDDEEE  cccdddeee
5

投稿2021/02/01 01:14

Daregada

総合スコア11990

ベストアンサー

正規表現を使って置き換えればできます。

python
1>>> import pandas as pd
2>>> import re
3>>> df = pd.DataFrame({'タイトル':['aaa:', 'bbb:'], '説明文':['AAA<BBB','BBB=CCC'], 'タグ':['aaa,bbb', 'bbb,ccc']})
4>>>
5>>> pattern = r':|,|<|='
6>>>
7>>> def rm(s):
8...     return re.sub(pattern, '', s)
9...
10>>> df['タイトル2'] = df['タイトル'].apply(rm)
11>>> df['説明文2'] = df['説明文'].apply(rm)
12>>> df['タグ2'] = df['タグ'].apply(rm)
13>>> print(df)
14   タイトル      説明文       タグ タイトル2    説明文2     タグ2
150  aaa:  AAA<BBB  aaa,bbb   aaa  AAABBB  aaabbb
161  bbb:  BBB=CCC  bbb,ccc   bbb  BBBCCC  bbbccc

あとは、patternを増やしていってください。

追加の質問について

position 17 は行番号やリストの要素番号ではなく、正規表現パターン文字列の中でエラーが検出された位置（先頭を0として数えた文字位置）を指しています。
*や?などは繰り返しを表す特殊文字なので、文字そのものとして使う場合にエスケープしないとこのメッセージが出ます。

python
1>>> re.sub(r'*', '', 'abc')
2Traceback (most recent call last):
3  File "<stdin>", line 1, in <module>
4  File "C:\Users\shinp\anaconda3\lib\re.py", line 210, in sub
5    return _compile(pattern, flags).sub(repl, string, count)
6  File "C:\Users\shinp\anaconda3\lib\re.py", line 304, in _compile
7    p = sre_compile.compile(pattern, flags)
8  File "C:\Users\shinp\anaconda3\lib\sre_compile.py", line 764, in compile
9    p = sre_parse.parse(p, flags)
10  File "C:\Users\shinp\anaconda3\lib\sre_parse.py", line 948, in parse
11    p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
12  File "C:\Users\shinp\anaconda3\lib\sre_parse.py", line 443, in _parse_sub
13    itemsappend(_parse(source, state, verbose, nested + 1,
14  File "C:\Users\shinp\anaconda3\lib\sre_parse.py", line 668, in _parse
15    raise source.error("nothing to repeat",
16re.error: nothing to repeat at position 0

これは、

python
1>>> re.sub(r'\*', '', 'abc')
2'abc'

と修正すれば大丈夫です。
他にも特殊文字があるので、Python公式ドキュメントの「re --- 正規表現操作」のページ内を「特殊文字」で検索して、よく読んでください。

投稿2021/01/31 21:57

編集2021/01/31 23:58

ppaul

総合スコア24672

fukubaka

2021/01/31 23:28

回答して頂きありがとうございます。実行したところ、新たなエラーが出ました。追記として質問欄に記載してありますので、お時間ございましたら確認お願い致します。

fukubaka

2021/02/01 01:25

ありがとうございました。無事、除去することができました。

行動規範の内容に同意します

あなたの回答

tips

プレビュー

行動規範の内容に同意します

質問の解決につながる回答をしましょう。サンプルコードなど、より具体的な説明があると質問者の理解の助けになります。また、読む側のことを考えた、分かりやすい文章を心がけましょう。

15分調べてもわからないことは
teratailで質問しよう！

ただいまの回答率
85.30%

質問をまとめることで
思考を整理して素早く解決

テンプレート機能で
簡単に質問をまとめる

質問する

CSVファイルのテキストデータの特殊記号除去

追記

関連した質問