Python数据分析pandas库

pandas内有两种主要数据结构:Series和DataFrame

1 >>> a=pd.Series([1,2],index=['a','b']) 
2 >>> a
3 a    1
4 b    2
5 dtype: int64
1 >>> b.index
2 RangeIndex(start=0, stop=2, step=1)
3 >>> b.values
4 array(['b', 'a'], dtype=object)
5 >>> a/2
6 a    0.5
7 b    1.0
8 dtype: float64
9 >>> 

Series的切片与选择

>>> s[0:3:2]
a    2
c    6
dtype: int64  
 1 >>> s3=pd.Series(arr)    另一种方式生成series
 2 >>> s3
 3 0    1
 4 1    2
 5 2    3
 6 3    4
 7 dtype: int32
 8 >>> s3=pd.Series(s)
 9 >>> s3
10 a    2
11 b    5
12 c    6
13 d    3
14 dtype: int64
15 >>> s[s>8]
16 Series([], dtype: int64)
17 >>> s
18 a    2
19 b    5
20 c    6
21 d    3
22 dtype: int64
23 >>> s[s>3]    找出>3的元素
24 b    5
25 c    6
26 dtype: int64
27 >>> np.log(s)    对series直接运用函数
28 a    0.693147
29 b    1.609438
30 c    1.791759
31 d    1.098612
32 dtype: float64
33 >>> s.isin([5,6])     判断series中的各元素是否在给定列表中,返回boolean值
34 a    False
35 b     True
36 c     True
37 d    False
38 dtype: bool
39 >>> s[s.isin([5,6])]
40 b    5
41 c    6
42 dtype: int64
43 >>> s2=pd.Series([5,2,np.NaN,7,np.NaN])
44 >>> s2
45 0    5.0
46 1    2.0
47 2    NaN
48 3    7.0
49 4    NaN
50 dtype: float64
51 >>> s2.isnull()
52 0    False
53 1    False
54 2     True
55 3    False
56 4     True
57 dtype: bool
58 >>> s2.notnull()
59 0     True
60 1     True
61 2    False
62 3     True
63 4    False
64 dtype: bool
>>> s2[s2.isnull()]
2   NaN
4   NaN
dtype: float64

DataFrame的使用

 1 >>> frame2=pd.DataFrame(fram,columns=['name','age'])
 2 >>> frame2
 3         name  age
 4 red        1    2
 5 yellow     5    6
 6 blue       9   10
 7 black     13   14
 8 >>> frame2.values
 9 array([[ 1,  2],
10        [ 5,  6],
11        [ 9, 10],
12        [13, 14]])
13 >>> frame2.index
14 Index([u'red', u'yellow', u'blue', u'black'], dtype='object')
15 >>> frame2.columns
16 Index([u'name', u'age'], dtype='object')
17 >>> frame2['name']
18 red        1
19 yellow     5
20 blue       9
21 black     13
22 Name: name, dtype: int32
23 >>> frame2.name
24 red        1
25 yellow     5
26 blue       9
27 black     13
28 Name: name, dtype: int32
29 >>> frame2.age
30 red        2
31 yellow     6
32 blue      10
33 black     14
34 Name: age, dtype: int32
35 >>> frame2[index=['red']]    此写法是语法错误;按行标签选取应使用 frame2.loc[['red']]
>>> frame2[0:2]
        name  age
red        1    2
yellow     5    6
>>> frame2['name'][2]
9
1 >>> s.idxmin()
2 'a'
3 >>> s.idxmax9)
4 SyntaxError: invalid syntax
5 >>> s.idxmax()
6 'c'
7 >>> s.index.is_unique
8 True
>>> fram
        id  name  age  home
red      0     1    2     3
yellow   4     5    6     7
blue     8     9   10    11
black   12    13   14    15
>>> frame4=fram.drop(['name','age'],axis=1)   删除列
>>> frame4
        id  home
red      0     3
yellow   4     7
blue     8    11
black   12    15
 1 >>> f=lambda x:x.max()-x.min()   对frame运用自定义函数
 2 >>> fram.apply(f)
 3 id      12
 4 name    12
 5 age     12
 6 home    12
 7 dtype: int64
 8 >>> fram.apply(f,axis=1)
 9 red       3
10 yellow    3
11 blue      3
12 black     3
13 dtype: int64
14 >>> fram.apply(f,axis=0)
15 id      12
16 name    12
17 age     12
18 home    12
19 dtype: int64
20 >>> def f(x):
21     return pd.Series([x.min(),x.max()],index=['min','max'])
22 
23 >>> fram.apply(f)
24      id  name  age  home
25 min   0     1    2     3
26 max  12    13   14    15

  DataFrame的一些数学统计值

 1 >>> fram.describe()
 2               id       name        age       home
 3 count   4.000000   4.000000   4.000000   4.000000
 4 mean    6.000000   7.000000   8.000000   9.000000
 5 std     5.163978   5.163978   5.163978   5.163978
 6 min     0.000000   1.000000   2.000000   3.000000
 7 25%     3.000000   4.000000   5.000000   6.000000
 8 50%     6.000000   7.000000   8.000000   9.000000
 9 75%     9.000000  10.000000  11.000000  12.000000
10 max    12.000000  13.000000  14.000000  15.000000
11 >>> fram.sum()
12 id      24
13 name    28
14 age     32
15 home    36
16 dtype: int64
17 >>> fram.mean()
18 id      6.0
19 name    7.0
20 age     8.0
21 home    9.0
22 dtype: float64
23 >>> fram.min()
24 id      0
25 name    1
26 age     2
27 home    3
28 dtype: int32