Pandas数据分析基础

Pandas模块学习

Pandas基本介绍

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import numpy as np
import pandas as pd

s = pd.Series([1,3,4,6,np.nan,44,1])
print(s)
#0 1.0
#1 3.0
#2 4.0
#3 6.0
#4 NaN
#5 44.0
#6 1.0

dates = pd.date_range('20200202',periods=6)
print(dates) #DatetimeIndex(['2020-02-02', '2020-02-03', '2020-02-04', '2020-02-05',
# '2020-02-06', '2020-02-07'],

df = pd.DataFrame(np.random.rand(6,4),index=dates,columns=['a','b','c','d']) #index指每行的坐标,也就是纵坐标,column是横坐标
print(df)

# a b c d
# 2020-02-02 0.640585 0.969161 0.367313 0.680238
# 2020-02-03 0.619822 0.430586 0.451134 0.365451
# 2020-02-04 0.460065 0.730973 0.540782 0.844242
# 2020-02-05 0.154691 0.324446 0.945344 0.221274
# 2020-02-06 0.584177 0.865024 0.226741 0.834784
# 2020-02-07 0.501330 0.016878 0.927609 0.128724

df_2 = pd.DataFrame(np.arange(12).reshape((3,4))) #不加index和columns默认是0,1,2,3,4作为坐标
print(df_2)
# 0 1 2 3
# 0 0 1 2 3
# 1 4 5 6 7
# 2 8 9 10 11

df_3 = pd.DataFrame({'A':1.0,'B':pd.Timestamp('20130102'),'C':pd.Series(1,list(range(4)),dtype='float32')})
print(df_3)
# A B C
# 0 1.0 2013-01-02 1.0
# 1 1.0 2013-01-02 1.0
# 2 1.0 2013-01-02 1.0
# 3 1.0 2013-01-02 1.0
print(df_3.dtypes)
# A float64
# B datetime64[ns]
# C float32
# dtype: object
print(df_3.index) #Int64Index([0, 1, 2, 3], dtype='int64')
print(df_3.columns) #Index(['A', 'B', 'C'], dtype='object')
print(df_3.describe())
# A C
# count 4.0 4.0
# mean 1.0 1.0
# std 0.0 0.0
# min 1.0 1.0
# 25% 1.0 1.0
# 50% 1.0 1.0
# 75% 1.0 1.0
# max 1.0 1.0
print(df_3.T) #转置
# 0 ... 3
# A 1 ... 1
# B 2013-01-02 00:00:00 ... 2013-01-02 00:00:00
# C 1 ... 1
print(df_3.sort_index(axis=1,ascending=False)) #对行列进行排序 .sort_index(axis=行还是列,ascending=是否排序成增序列)
# C B A
# 0 1.0 2013-01-02 1.0
# 1 1.0 2013-01-02 1.0
# 2 1.0 2013-01-02 1.0
# 3 1.0 2013-01-02 1.0

print(df_3.sort_values(by='E')) #按照某一列对行进行排序

Pandas 选择数据

请注意:在pandas版本0.20.0及其以后版本中,ix已经不被推荐使用,建议采用iloc和loc实现ix。这是为什么呢?这是由于ix的复杂特点可能使ix使用起来有些棘手:

如果索引是整数类型,则ix将仅使用基于标签的索引,而不会回退到基于位置的索引。如果标签不在索引中,则会引发错误。
如果索引不仅包含整数,则给定一个整数,ix将立即使用基于位置的索引而不是基于标签的索引。但是,如果ix被赋予另一种类型(例如字符串),则它可以使用基于标签的索引。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import numpy as np
import pandas as pd
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
# A B C D
# 2013-01-01 0 1 2 3
# 2013-01-02 4 5 6 7
# 2013-01-03 8 9 10 11
# 2013-01-04 12 13 14 15
# 2013-01-05 16 17 18 19
# 2013-01-06 20 21 22 23


print(df['A']) #df.A
# 2013-01-01 0
# 2013-01-02 4
# 2013-01-03 8
# 2013-01-04 12
# 2013-01-05 16
# 2013-01-06 20
print(df[0:3]) #0-3行 也可以用df['20130101':'20130103']
# A B C D
# 2013-01-01 0 1 2 3
# 2013-01-02 4 5 6 7
# 2013-01-03 8 9 10 11

print(df.loc['20130103']) #选出特定的一行 .loc()是根据数字标签label来筛选
# A 8
# B 9
# C 10
# D 11
# Name: 2013-01-03 00:00:00, dtype: int32
print(df.loc[:, ['A', 'B']]) #选出特定的一列
# A B
# 2013-01-01 0 1
# 2013-01-02 4 5
# 2013-01-03 8 9
# 2013-01-04 12 13
# 2013-01-05 16 17
# 2013-01-06 20 21

print(df.loc['20130103':'20130105', ['A', 'B']]) #特定的行和列
# A B
# 2013-01-03 8 9
# 2013-01-04 12 13
# 2013-01-05 16 17

print(df.iloc[1:3, 0:1]) #不按照行列的名字,而是按照行数和列数的index来筛选 (注意:左闭右开)
# A
# 2013-01-02 4
# 2013-01-03 8
print(df.iloc[[1, 3, 5], 1:3])
# B C
# 2013-01-02 5 6
# 2013-01-04 13 14
# 2013-01-06 21 22
print(df.iloc[range(1, 3), 1:3])


#请注意:在pandas版本0.20.0及其以后版本中,ix已经不被推荐使用,建议采用iloc和loc实现ix。这是为什么呢?这是由于ix的复杂特点可能使ix使用起来有些棘手:
#如果索引是整数类型,则ix将仅使用基于标签的索引,而不会回退到基于位置的索引。如果标签不在索引中,则会引发错误。
#如果索引不仅包含整数,则给定一个整数,ix将立即使用基于位置的索引而不是基于标签的索引。但是,如果ix被赋予另一种类型(例如字符串),则它可以使用基于标签的索引。
#print(df._ix[:3, ['A', 'B']])

print(df[df.A>8])
# A B C D
# 2013-01-04 12 13 14 15
# 2013-01-05 16 17 18 19
# 2013-01-06 20 21 22 23

Pandas设置值

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import numpy as np
import pandas as pd
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
# A B C D
# 2013-01-01 0 1 2 3
# 2013-01-02 4 5 6 7
# 2013-01-03 8 9 10 11
# 2013-01-04 12 13 14 15
# 2013-01-05 16 17 18 19
# 2013-01-06 20 21 22 23

#更改DataFrame中的值
df.iloc[2,2] = 111
df.loc['20130101','B'] = 222
#df[df.A>4] = 0 #df.A>4的那行为赋值为0
df.B[df.A>4] = 0 #只对一列操作
# A B C D
# 2013-01-01 0 222 2 3
# 2013-01-02 4 5 6 7
# 2013-01-03 8 0 111 11
# 2013-01-04 12 0 14 15
# 2013-01-05 16 0 18 19
# 2013-01-06 20 0 22 23

#增加series
df['E'] = pd.Series([1,2,3,4,5,6],index=pd.date_range('20130101',periods=6))
print(df)
# A B C D E
# 2013-01-01 0 222 2 3 1
# 2013-01-02 4 5 6 7 2
# 2013-01-03 8 0 111 11 3
# 2013-01-04 12 0 14 15 4
# 2013-01-05 16 0 18 19 5
# 2013-01-06 20 0 22 23 6

Pandas处理丢失数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import numpy as np
import pandas as pd
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)
# A B C D
# 2013-01-01 0 NaN 2.0 3
# 2013-01-02 4 5.0 NaN 7
# 2013-01-03 8 9.0 10.0 11
# 2013-01-04 12 13.0 14.0 15
# 2013-01-05 16 17.0 18.0 19
# 2013-01-06 20 21.0 22.0 23

#drop none value data
print(df.dropna(axis = 0,how = 'any'))
#axis=0丢掉行,axis=1丢掉列 #how='any'代表只要某一行(列)有NaN就删除该行(列) how='all'代表全是NaN才删除

# A B C D
# 2013-01-03 8 9.0 10.0 11
# 2013-01-04 12 13.0 14.0 15
# 2013-01-05 16 17.0 18.0 19
# 2013-01-06 20 21.0 22.0 23


#fill none value data 填入数据
print(df.fillna(value=123))
# A B C D
# 2013-01-01 0 123.0 2.0 3
# 2013-01-02 4 5.0 123.0 7
# 2013-01-03 8 9.0 10.0 11
# 2013-01-04 12 13.0 14.0 15
# 2013-01-05 16 17.0 18.0 19
# 2013-01-06 20 21.0 22.0 23
print(df.isnull())
# A B C D
# 2013-01-01 False True False False
# 2013-01-02 False False True False
# 2013-01-03 False False False False
# 2013-01-04 False False False False
# 2013-01-05 False False False False
# 2013-01-06 False False False False

#是否有数据丢失
print(np.any(df.isnull()==True))

Pandas导入导出数据

1
2
3
4
5
6
7
#import numpy as np
import pandas as pd

data = pd.read_csv('2.csv')
print(data)

data.to_pickle('22.pickle')

Pandas合并concat

.concat()合并

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import numpy as np
import pandas as pd

df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
print(df1)
print(df2)
print(df3)
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# a b c d
# 0 1.0 1.0 1.0 1.0
# 1 1.0 1.0 1.0 1.0
# 2 1.0 1.0 1.0 1.0
# a b c d
# 0 2.0 2.0 2.0 2.0
# 1 2.0 2.0 2.0 2.0
# 2 2.0 2.0 2.0 2.0
res = pd.concat([df1, df2, df3],axis=0) #竖向合并
print(res)
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 0 1.0 1.0 1.0 1.0
# 1 1.0 1.0 1.0 1.0
# 2 1.0 1.0 1.0 1.0
# 0 2.0 2.0 2.0 2.0
# 1 2.0 2.0 2.0 2.0
# 2 2.0 2.0 2.0 2.0

res2 = pd.concat([df1,df2,df3], axis=0, ignore_index=True) #合并后重新给纵坐标赋值
print(res2)
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0
# 5 1.0 1.0 1.0 1.0
# 6 2.0 2.0 2.0 2.0
# 7 2.0 2.0 2.0 2.0
# 8 2.0 2.0 2.0 2.0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import numpy as np
import pandas as pd

df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
print(df1)
# a b c d
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 0.0 0.0 0.0 0.0
print(df2)
# b c d e
# 2 1.0 1.0 1.0 1.0
# 3 1.0 1.0 1.0 1.0
# 4 1.0 1.0 1.0 1.0
res = pd.concat([df1,df2])
print(res)
# a b c d e
# 1 0.0 0.0 0.0 0.0 NaN
# 2 0.0 0.0 0.0 0.0 NaN
# 3 0.0 0.0 0.0 0.0 NaN
# 2 NaN 1.0 1.0 1.0 1.0
# 3 NaN 1.0 1.0 1.0 1.0
# 4 NaN 1.0 1.0 1.0 1.0

res = pd.concat([df1,df2],join='inner',axis=0) #inner 合并共有的序列
#要是想处理序号就res = pd.concat([df1,df2],join='inner',ignore_index=true,axis=0)
print(res)
# b c d
# 1 0.0 0.0 0.0
# 2 0.0 0.0 0.0
# 3 0.0 0.0 0.0
# 2 1.0 1.0 1.0
# 3 1.0 1.0 1.0
# 4 1.0 1.0 1.0


#在表后添加一行
s1 = pd.Series([1,2,3,4],index=['a','b','c','d'])
t = df1.append(s1,ignore_index=True)
print(t)
# a b c d
# 0 0.0 0.0 0.0 0.0
# 1 0.0 0.0 0.0 0.0
# 2 0.0 0.0 0.0 0.0
# 3 1.0 2.0 3.0 4.0

Pandas合并merge

基于一个key合并

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import numpy as np
import pandas as pd

left = pd.DataFrame({'key':['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['K0','K1','K2','K3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left)
print(right)
# key A B
# 0 K0 A0 B0
# 1 K1 A1 B1
# 2 K2 A2 B2
# 3 K3 A3 B3
# key C D
# 0 K0 C0 D0
# 1 K1 C1 D1
# 2 K2 C2 D2
# 3 K3 C3 D3

res = pd.merge(left,right,on='key') #基于key合并
print(res)
# key A B C D
# 0 K0 A0 B0 C0 D0
# 1 K1 A1 B1 C1 D1
# 2 K2 A2 B2 C2 D2
# 3 K3 A3 B3 C3 D3

基于多个key合并

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import numpy as np
import pandas as pd

left = pd.DataFrame({'key1':['K0','K0','K1','K2'],
'key2':['K0','K1','K0','K1'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1':['K0','K1','K1','K2'],
'key2':['K0','K0','K0','K0'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left)
print(right)
# key1 key2 A B
# 0 K0 K0 A0 B0
# 1 K0 K1 A1 B1
# 2 K1 K0 A2 B2
# 3 K2 K1 A3 B3
# key1 key2 C D
# 0 K0 K0 C0 D0
# 1 K1 K0 C1 D1
# 2 K1 K0 C2 D2
# 3 K2 K0 C3 D3

res = pd.merge(left,right,on=['key1','key2'])
#观察不能发现,只有两个key都满足相等时才会合并,且在一堆多的情况下会分别合并,我们默认这种合并为how='inner'
print(res)
# key1 key2 A B C D
# 0 K0 K0 A0 B0 C0 D0
# 1 K1 K0 A2 B2 C1 D1
# 2 K1 K0 A2 B2 C2 D2

res = pd.merge(left,right,on=['key1','key2'],how='outer')
#不管有没有都合并,空的地方填入NaN
print(res)
# key1 key2 A B C D
# 0 K0 K0 A0 B0 C0 D0
# 1 K0 K1 A1 B1 NaN NaN
# 2 K1 K0 A2 B2 C1 D1
# 3 K1 K0 A2 B2 C2 D2
# 4 K2 K1 A3 B3 NaN NaN
# 5 K2 K0 NaN NaN C3 D3

res = pd.merge(left,right,on=['key1','key2'],how='right')
#基于某个DataFrame进行合并,另一个有对应的key相等就合并,没有就赋值NaN
print(res)
# key1 key2 A B C D
# 0 K0 K0 A0 B0 C0 D0
# 1 K1 K0 A2 B2 C1 D1
# 2 K1 K0 A2 B2 C2 D2
# 3 K2 K0 NaN NaN C3 D3

res = pd.merge(left,right,on=['key1','key2'],how='right',indicator=True)
#indicator=True 显示合并方式
print(res)
# key1 key2 A B C D _merge
# 0 K0 K0 A0 B0 C0 D0 both
# 1 K1 K0 A2 B2 C1 D1 both
# 2 K1 K0 A2 B2 C2 D2 both
# 3 K2 K0 NaN NaN C3 D3 right_only

下面是按照index来合并

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import numpy as np
import pandas as pd

left = pd.DataFrame({'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']},
index=['K0','K1','K2','K3'])
right = pd.DataFrame({'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']},
index=['K1','K2','K3','K4'])
print(left)
print(right)
# A B
# K0 A0 B0
# K1 A1 B1
# K2 A2 B2
# K3 A3 B3
# C D
# K1 C0 D0
# K2 C1 D1
# K3 C2 D2
# K4 C3 D3
print(pd.merge(left,right,left_index=True,right_index=True,how='outer'))
#按照index来合并
# A B C D
# K0 A0 B0 NaN NaN
# K1 A1 B1 C0 D0
# K2 A2 B2 C1 D1
# K3 A3 B3 C2 D2
# K4 NaN NaN C3 D3

suffixes参数的作用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import numpy as np
import pandas as pd

boy = pd.DataFrame({'k':['K0','K1','K2'],
'age':[1,2,3]})
girl = pd.DataFrame({'k':['K0','K1','K2'],
'age':[4,5,6]})

print(boy)
print(girl)
# k age
# 0 K0 1
# 1 K1 2
# 2 K2 3
# k age
# 0 K0 4
# 1 K1 5
# 2 K2 6
res = pd.merge(boy,girl,on='k',suffixes=['_boy','_girl'],how='inner')
print(res)
# k age_boy age_girl
# 0 K0 1 4
# 1 K1 2 5
# 2 K2 3 6

本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!