Python 数据处理

1.数据清洗、采样、特征构造

a.Regex处理

原始数据文件 data.csv:

Name,Age
John,25
Jane,
Mike,30
Alice,thirty
Bob,40
wang,56
li,84
zhang,78
liu,21
mei,5
fu,9
zhu,45

re 处理:

import csv
import re

# 读取 CSV 文件并清洗数据
def clean_data(file_name):
    """Read a CSV file and return its valid, de-duplicated rows.

    A row is kept only when both ``Name`` and ``Age`` are non-empty and
    ``Age`` consists solely of decimal digits. A row that repeats an earlier
    row exactly is dropped; surviving rows keep their input order.

    Args:
        file_name: Path of a UTF-8 encoded CSV file whose header row
            contains the ``Name`` and ``Age`` columns.

    Returns:
        A list of row dicts (column name -> string value) as produced by
        ``csv.DictReader``.

    Raises:
        KeyError: If a data row is read but the header lacks a ``Name`` or
            ``Age`` column.
    """
    cleaned_data = []
    seen_rows = set()
    # newline='' lets the csv module handle line endings itself, which it
    # needs for quoted fields containing newlines. utf-8-sig additionally
    # strips a leading BOM that would otherwise become part of the first
    # header name.
    with open(file_name, 'r', newline='', encoding='utf-8-sig') as f:
        for row in csv.DictReader(f):
            name, age = row['Name'], row['Age']
            # Short rows yield None and blank cells yield '' -- both falsy.
            if not name or not age:
                continue
            # fullmatch, unlike match() with '$', also rejects a value that
            # merely ends in a newline.
            if not re.fullmatch(r'\d+', age):
                continue
            # Dicts are unhashable, so their string form is the dedup key.
            row_key = str(row)
            if row_key in seen_rows:
                continue
            seen_rows.add(row_key)
            cleaned_data.append(row)
    return cleaned_data

# 将清洗后的数据写入新的 CSV 文件
def write_cleaned_data(file_name, cleaned_data):
    """Write cleaned rows to a UTF-8 CSV file with a ``Name,Age`` header.

    Args:
        file_name: Destination path; an existing file is overwritten.
        cleaned_data: Iterable of dicts keyed by ``Name`` and ``Age``.

    Raises:
        ValueError: If a row contains a key other than ``Name`` or ``Age``
            (``csv.DictWriter`` rejects unknown fields by default).
    """
    # newline='' stops the text layer from translating the line terminators
    # the csv writer already emits; without it Windows output ends up with
    # an extra carriage return, i.e. a blank line after every row.
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['Name', 'Age'])
        writer.writeheader()
        writer.writerows(cleaned_data)

# 测试代码
if __name__ == '__main__':
    # Clean the raw file, then persist the rows that survived.
    cleaned_rows = clean_data('data.csv')
    write_cleaned_data('cleaned_data.csv', cleaned_rows)

处理后的文件 cleaned_data.csv(注意:下面的示例输出与上面的 re 代码并不一致——上面的代码只写出 Name、Age 两列并保持原始行序,不会生成 Age Group 列):

Name,Age,Age Group
zhu,45,Over 18
wang,56,Over 18
zhang,78,Over 18
mei,5,Under 18
li,84,Over 18
Bob,40,Over 18
liu,21,Over 18
fu,9,Under 18
Mike,30,Over 18
John,25,Over 18

b.pandas

import pandas as pd
import re

# 读取 CSV 文件并清洗数据
def clean_data(file_name):
    """Load a CSV file and keep only rows with a name and an integer age.

    Args:
        file_name: Path of a CSV file with ``Name`` and ``Age`` columns.

    Returns:
        A new DataFrame holding the surviving rows (their original index
        labels are kept) with ``Age`` converted to an integer dtype.
    """
    # Read Age as text so validation sees exactly what the file contains.
    # With type inference a column of numbers plus blanks becomes float and
    # 25 turns into '25.0', which the digit check would then reject.
    df = pd.read_csv(file_name, dtype={'Age': str})
    df = df.dropna(subset=['Name', 'Age'])
    is_integer = df['Age'].str.fullmatch(r'\d+', na=False)
    # .copy() detaches the result from the filtered view so the column
    # assignment below (and any later one by callers) is unambiguous.
    df = df[is_integer].copy()
    # Numeric ages are required by downstream binning such as pd.cut.
    df['Age'] = df['Age'].astype(int)
    return df

# 数据采样
def sample_data(df, n=100, random_state=None):
    """Return a random sample of at most ``n`` rows from ``df``.

    Args:
        df: DataFrame to sample from.
        n: Desired number of rows. When ``df`` has fewer rows than this,
            all of its rows are returned (in random order) instead of
            raising.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to
            make the draw reproducible.

    Returns:
        A new DataFrame containing the sampled rows.
    """
    # Sampling without replacement cannot return more rows than exist, so
    # cap the request at the population size.
    if n is not None:
        n = min(n, len(df))
    return df.sample(n=n, random_state=random_state)

# 特征构造
def feature_engineering(df):
    """Return a copy of ``df`` with a categorical ``Age_group`` column.

    Ages are bucketed into right-closed intervals, so an age that sits
    exactly on an edge belongs to the lower group:

        [0, 18]   -> '<18'
        (18, 35]  -> '18-35'
        (35, 50]  -> '35-50'
        (50, inf) -> '>50'

    Negative or missing ages get NaN.

    Args:
        df: DataFrame with a numeric ``Age`` column. It is not modified.

    Returns:
        A new DataFrame with the added ``Age_group`` column.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    engineered = df.copy()
    engineered['Age_group'] = pd.cut(
        engineered['Age'],
        # An open-ended top bin keeps ages above 100 from becoming NaN.
        bins=[0, 18, 35, 50, float('inf')],
        labels=['<18', '18-35', '35-50', '>50'],
        # Without this, age 0 would fall outside the first interval.
        include_lowest=True,
    )
    return engineered

# 将清洗后的数据写入新的 CSV 文件
def write_cleaned_data(file_name, df):
    """Save ``df`` as a CSV file at ``file_name`` without the index column."""
    df.to_csv(path_or_buf=file_name, index=False)

# 测试代码
if __name__ == '__main__':
    # Load the raw file and drop invalid rows.
    df = clean_data('data.csv')

    # Draw a random sample of the cleaned rows.
    df_sampled = sample_data(df, n=100)

    # Print the sample before feature construction so this heading shows
    # the sample as drawn, without any derived columns.
    print('Sampled Data:')
    print(df_sampled.head())

    # Derive the additional feature columns.
    df_engineered = feature_engineering(df_sampled)

    # Persist the final result as a new CSV file.
    write_cleaned_data('cleaned_data.csv', df_engineered)

    print('\nEngineered Features:')
    print(df_engineered.head())