Pythonでデータの記述統計を求めたいです

Pythonで、スポーツの実施状況等に関する世論調査（令和3年11月調査）のローデータをすべて用いて、男女別の身長と体重の記述統計（平均値、中央値、標準偏差、四分位範囲）を計算したいのですが、

CSVファイルからデータを読み込んだ後に、以下を実行すると、次のように表示されます。

print(df.describe())

SAMPLENUMBER Q1 Q2.1 Q3 Q4 Q5.1 Q5.2 Q6 Q7 Q7-18
count 2000 2001 2001 2001 2001 2001 2001 2001 2001 40
unique 2000 3 63 48 5 52 72 5 20 28
top 18001 2 70 40 2 160 50 4 15 パート
freq 1 1406 73 212 688 155 142 1928 497 8

... Q48-4 Q48-5 Q48-6 Q48-7 Q48-8 Q48-9 Q48-10 Q48-11 Q48-11-F Q48-12

count ... 2001 2001 2001 2001 2001 2001 2001 2001 8 2001
unique ... 3 3 3 3 3 3 3 3 8 3
top ... 0 0 0 0 0 0 0 0 その他：＿ 1
freq ... 1881 1956 1903 1861 1847 1948 1828 1993 1 1270

[4 rows x 2914 columns]

df.iloc[1:].mean()[['Q5.1','Q1']]
と入力してみると、

C:\Users\Owner\AppData\Local\Temp\ipykernel_2652\3443647166.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError. Select only valid columns before calling the reduction.
df.iloc[1:].mean()[['Q5.1','Q1']]

KeyError Traceback (most recent call last)
Input In [12], in <cell line: 1>()
----> 1 df.iloc[1:].mean()[['Q5.1','Q1']]

File ~\anaconda3\lib\site-packages\pandas\core\series.py:984, in Series.__getitem__(self, key)
981 key = np.asarray(key, dtype=bool)
982 return self._get_values(key)
--> 984 return self._get_with(key)

File ~\anaconda3\lib\site-packages\pandas\core\series.py:1024, in Series._get_with(self, key)
1021 return self.iloc[key]
1023 # handle the dup indexing case GH#4246
-> 1024 return self.loc[key]

File ~\anaconda3\lib\site-packages\pandas\core\indexing.py:967, in _LocationIndexer.__getitem__(self, key)
964 axis = self.axis or 0
966 maybe_callable = com.apply_if_callable(key, self.obj)
--> 967 return self._getitem_axis(maybe_callable, axis=axis)

File ~\anaconda3\lib\site-packages\pandas\core\indexing.py:1191, in _LocIndexer._getitem_axis(self, key, axis)
1188 if hasattr(key, "ndim") and key.ndim > 1:
1189 raise ValueError("Cannot index with multidimensional key")
-> 1191 return self._getitem_iterable(key, axis=axis)
1193 # nested tuple slicing
1194 if is_nested_tuple(key, labels):

File ~\anaconda3\lib\site-packages\pandas\core\indexing.py:1132, in _LocIndexer._getitem_iterable(self, key, axis)
1129 self._validate_key(key, axis)
1131 # A collection of keys
-> 1132 keyarr, indexer = self._get_listlike_indexer(key, axis)
1133 return self.obj._reindex_with_indexers(
1134 {axis: [keyarr, indexer]}, copy=True, allow_dups=True
1135 )

File ~\anaconda3\lib\site-packages\pandas\core\indexing.py:1327, in _LocIndexer._get_listlike_indexer(self, key, axis)
1324 ax = self.obj._get_axis(axis)
1325 axis_name = self.obj._get_axis_name(axis)
-> 1327 keyarr, indexer = ax._get_indexer_strict(key, axis_name)
1329 return keyarr, indexer

File ~\anaconda3\lib\site-packages\pandas\core\indexes\base.py:5782, in Index._get_indexer_strict(self, key, axis_name)
5779 else:
5780 keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 5782 self._raise_if_missing(keyarr, indexer, axis_name)
5784 keyarr = self.take(indexer)
5785 if isinstance(key, Index):
5786 # GH 42790 - Preserve name from an Index

File ~\anaconda3\lib\site-packages\pandas\core\indexes\base.py:5842, in Index._raise_if_missing(self, key, indexer, axis_name)
5840 if use_interval_msg:
5841 key = list(key)
-> 5842 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
5844 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
5845 raise KeyError(f"{not_found} not in index")

KeyError: "None of [Index(['Q5.1', 'Q1'], dtype='object')] are in the [index]"

と表示されてしまうのですが、どう対応すればよいのかを教えていただきたいです。
（データのQ1は性別で、1は男性、2は女性、Q5.1は身長です）

jbpb0

2023/01/26 11:25

https://teratail.com/help/avoid-asking の「過去に投稿した質問と同じ内容の質問」を見てください https://teratail.com/questions/s2d1lsqsbe1c56

行動規範の内容に同意します

回答1件

ローデータ全てをダウンロードして結合するのは面倒くさいので、試しに、令和3年度「スポーツの実施状況等に関する世論調査」(令和3年11月調査):スポーツ庁から 1 令和3年度スポーツ世論調査_ローデータ(1~2,000) (Excel:4.8MB)をダウンロードして記述統計量を計算してみました。

python
import pandas as pd


def summarize_by_sex(data):
    """Return per-sex descriptive statistics for every other column of *data*.

    Args:
        data: DataFrame holding a '性別' (sex) column plus the numeric
            columns to summarize (here '身長' = height, '体重' = weight).

    Returns:
        DataFrame indexed by '性別' in order of first appearance, with
        two-level columns (variable, statistic). The statistics are mean,
        median, std, q25, q75 and iqr (= q75 - q25, the interquartile range).
        Rows whose '性別' is missing are left out by groupby's default.
    """
    return data.groupby('性別', sort=False).agg([
        'mean',
        'median',
        'std',
        ('q25', lambda s: s.quantile(0.25)),
        ('q75', lambda s: s.quantile(0.75)),
        # The interquartile range itself, not just its two endpoints.
        ('iqr', lambda s: s.quantile(0.75) - s.quantile(0.25)),
    ])


if __name__ == '__main__':
    excel_file = '20220310-spt_kensport01-000020487_9.xlsx'
    # The workbook can also be read straight from its URL:
    #   https://www.mext.go.jp/sports/content/20220310-spt_kensport01-000020487_9.xlsx
    # NOTE: this is only the first raw-data file (respondents 1-2,000). To cover
    # the whole survey, read each file the same way and pd.concat() the frames
    # before summarizing.

    # Survey column code -> readable label (sex, height, weight).
    cols = {'Q1': '性別', 'Q5.1': '身長', 'Q5.2': '体重'}

    # skiprows=[1, 2] drops the two sheet rows right below the header row
    # (presumably question text / labels - confirm against the workbook) so the
    # selected columns are parsed as numbers.
    # rename() maps by column name; read_excel returns the usecols columns in
    # sheet order, so renaming by position would depend on that order.
    df = pd.read_excel(excel_file, skiprows=[1, 2], usecols=list(cols)).rename(columns=cols)

    # Q1 codes: 1 = male, 2 = female; any other code becomes NaN.
    df['性別'] = df['性別'].map({1: '男性', 2: '女性'})

    dfx = summarize_by_sex(df)
    print(dfx)

# Result reported for the first data file (respondents 1-2,000); the iqr
# figures are q75 - q25 of the reported quartiles:
#
#   column  sex   mean        median  std        q25    q75    iqr
#   身長    男性  170.543111  170.0    6.641236  166.0  175.0   9.0
#   身長    女性  157.718857  158.0    5.601042  154.0  161.0   7.0
#   体重    男性   67.971556   66.0   12.041856   60.0   75.0  15.0
#   体重    女性   52.518857   50.0    9.020346   46.0   58.0  12.0