python数据分析——数据筛选实例【茅台】

import tushare as ts
import pandas as pd
from pandas import DataFrame,Series

DataFrame

- 索引:
    - df[col] df[[c1,c2]]:取列
    - df.loc[index] : 取行
    - df.loc[index,col] : 取元素
- 切片:
    - df[a:b]:切行
    - df.loc[:,a:b]:切列
- df运算:Series运算一致
- df级联:拼接

In [8]:

df = pd.read_csv('maotai.csv',index_col='date',parse_dates=['date'])
df.drop(labels='Unnamed: 0',axis=1,inplace=True)
df

Out[8]:

openclosehighlowvolumecode
date
2001-08-275.3925.5545.9025.132406318.00600519
2001-08-285.4675.7595.7815.407129647.79600519
2001-08-295.7775.6845.7815.64053252.75600519
2001-08-305.6685.7965.8605.62448013.06600519
2001-08-315.8045.7825.8775.74923231.48600519
2001-09-035.8125.7795.8705.75722112.09600519
2001-09-045.7825.8525.9495.76237006.77600519
2001-09-055.8765.8495.9245.81326066.95600519
2001-09-065.8355.7345.8545.70428997.03600519
2001-09-075.7025.5745.7735.57031552.25600519
2001-09-105.5315.7345.7575.47018878.89600519
2001-09-115.7495.6705.7735.65611390.81600519
2001-09-125.5205.6215.6565.51525045.19600519
2001-09-135.6265.6075.6715.57710986.68600519
2001-09-145.6265.6715.7025.5937672.83600519
2001-09-175.6375.5995.6705.5468983.97600519
2001-09-185.6065.6635.7105.60110773.26600519
2001-09-195.6715.7685.7685.6348650.53600519
2001-09-205.7655.7205.7885.70211173.35600519
2001-09-215.6685.6345.7185.6247879.72600519
2001-09-245.6345.6325.6855.6244068.60600519
2001-09-255.6685.6605.7095.6323488.45600519
2001-09-265.6425.6375.6995.6244956.26600519
2001-09-275.6375.7345.7775.6248778.04600519
2001-09-285.7655.7955.8125.70217088.47600519
2001-10-085.7815.7155.8095.6636552.17600519
2001-10-095.7185.8275.8595.7189558.52600519
2001-10-105.8275.6405.8485.62917548.69600519
2001-10-115.6265.5855.7025.57012306.84600519
2001-10-125.6095.6245.7265.32020010.70600519
.....................
2019-01-16658.000661.100665.650653.60042083.00600519
2019-01-17663.000659.250669.700655.80030284.00600519
2019-01-18665.000683.610690.200661.50068609.00600519
2019-01-21690.000686.980698.880681.88047051.00600519
2019-01-22686.000671.000686.000665.22042122.00600519
2019-01-23667.220665.660674.500663.13033624.00600519
2019-01-24670.300672.500677.830664.68029155.00600519
2019-01-25675.880679.700684.900669.93028986.00600519
2019-01-28684.000677.000693.980676.03036102.00600519
2019-01-29677.500685.000688.000668.00030342.00600519
2019-01-30684.990676.000686.220676.00022699.00600519
2019-01-31680.000689.600689.600677.56036696.00600519
2019-02-01697.040692.670699.000689.61030520.00600519
2019-02-11695.000725.300729.500688.18058140.00600519
2019-02-12723.000717.920725.490715.20035091.00600519
2019-02-13723.000735.250736.160717.22035677.00600519
2019-02-14731.200737.160743.660730.14032258.00600519
2019-02-15734.880724.780734.880723.02023237.00600519
2019-02-18731.000740.000744.950731.00037745.00600519
2019-02-19740.960730.760742.500726.66029501.00600519
2019-02-20735.000722.000735.000716.06037882.00600519
2019-02-21720.000720.250730.480717.00029850.00600519
2019-02-22717.000726.010728.490714.10025908.00600519
2019-02-25730.750742.330744.440717.99082622.00600519
2019-02-26747.800727.350749.180726.00057964.00600519
2019-02-27733.000740.700758.020731.00065942.00600519
2019-02-28737.900755.010763.420736.95056986.00600519
2019-03-01761.500789.300790.000761.00063840.00600519
2019-03-04795.000781.860798.330777.22084585.00600519
2019-03-05785.000779.780789.550775.88044830.00600519

4174 rows × 6 columns

In [9]:

#假如我从2010年1月1日开始,每月第一个交易日买入1手股票,每年最后一个交易日卖出所有股票,到今天为止,我的收益如何?
price_last = df['open'][-1]
df = df['2010':'2019'] #剔除首尾无用的数据
#Pandas提供了resample函数用便捷的方式对时间序列进行重采样,根据时间粒度的变大或者变小分为降采样和升采样:
df_monthly = df.resample("M").first()
df_yearly = df.resample("Y").last()[:-1] #去除最后一年
cost_money = 0
hold = 0 #每年持有的股票
for year in range(2010, 2020):
    
    cost_money -= df_monthly.loc[str(year)]['open'].sum()*100
    hold += len(df_monthly[str(year)]['open']) * 100
    if year != 2019:
        cost_money += df_yearly[str(year)]['open'][0] * hold
        hold = 0 #每年持有的股票
cost_money += hold * price_last

print(cost_money)
310250.69999999984

In [7]:

Out[7]:

openclosehighlowvolumecode
date
2010-01-04109.760108.446109.760108.04444304.88600519
2010-01-05109.116108.127109.441107.84631513.18600519
2010-01-06107.840106.417108.165106.12939889.03600519
2010-01-07106.417104.477106.691103.30248825.55600519
2010-01-08104.655103.379104.655102.16736702.09600519
2010-01-11104.400102.926105.230102.42224461.03600519
2010-01-12103.028105.708106.040102.49231063.40600519
2010-01-13104.649103.022105.389102.74137924.44600519
2010-01-14103.379107.552107.974103.37946454.64600519
2010-01-15107.533108.401110.641107.53345938.50600519
2010-01-18108.484109.110109.926108.42021461.53600519
2010-01-19109.116108.337109.441108.16517818.91600519
2010-01-20108.427105.881108.580105.80420972.95600519
2010-01-21105.842106.397107.450105.61317257.48600519
2010-01-22106.314104.738106.755103.30925432.94600519
2010-01-25104.560105.957106.761103.70423239.15600519
2010-01-26107.380106.378108.593105.03832889.16600519
2010-01-27105.951104.643107.068104.40019316.57600519
2010-01-28104.566107.974108.708104.33630267.52600519
2010-01-29108.452107.552108.612107.00437172.82600519
2010-02-01107.769107.776108.216106.57629655.94600519
2010-02-02107.208106.263108.484106.11715493.53600519
2010-02-03106.066105.887107.272104.78323034.65600519
2010-02-04105.868107.591108.006105.37622475.33600519
2010-02-05106.959109.282109.684106.57026234.30600519
2010-02-08109.282109.269112.058108.81631496.10600519
2010-02-09109.760109.193110.609108.82214151.24600519
2010-02-10109.760109.652110.137108.9316398.14600519
2010-02-11109.633110.641111.318109.50514945.05600519
2010-02-12111.018110.456111.164109.8889346.40600519
.....................
2019-01-16658.000661.100665.650653.60042083.00600519
2019-01-17663.000659.250669.700655.80030284.00600519
2019-01-18665.000683.610690.200661.50068609.00600519
2019-01-21690.000686.980698.880681.88047051.00600519
2019-01-22686.000671.000686.000665.22042122.00600519
2019-01-23667.220665.660674.500663.13033624.00600519
2019-01-24670.300672.500677.830664.68029155.00600519
2019-01-25675.880679.700684.900669.93028986.00600519
2019-01-28684.000677.000693.980676.03036102.00600519
2019-01-29677.500685.000688.000668.00030342.00600519
2019-01-30684.990676.000686.220676.00022699.00600519
2019-01-31680.000689.600689.600677.56036696.00600519
2019-02-01697.040692.670699.000689.61030520.00600519
2019-02-11695.000725.300729.500688.18058140.00600519
2019-02-12723.000717.920725.490715.20035091.00600519
2019-02-13723.000735.250736.160717.22035677.00600519
2019-02-14731.200737.160743.660730.14032258.00600519
2019-02-15734.880724.780734.880723.02023237.00600519
2019-02-18731.000740.000744.950731.00037745.00600519
2019-02-19740.960730.760742.500726.66029501.00600519
2019-02-20735.000722.000735.000716.06037882.00600519
2019-02-21720.000720.250730.480717.00029850.00600519
2019-02-22717.000726.010728.490714.10025908.00600519
2019-02-25730.750742.330744.440717.99082622.00600519
2019-02-26747.800727.350749.180726.00057964.00600519
2019-02-27733.000740.700758.020731.00065942.00600519
2019-02-28737.900755.010763.420736.95056986.00600519
2019-03-01761.500789.300790.000761.00063840.00600519
2019-03-04795.000781.860798.330777.22084585.00600519
2019-03-05785.000779.780789.550775.88044830.00600519

2221 rows × 6 columns

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tushare as ts

使用tushare包获取某股票的历史行情数据

In [14]:

df = ts.get_k_data("600519",start="1988-01-01")
df.to_csv("600519.csv")
df

Out[14]:

dateopenclosehighlowvolumecode
02001-08-275.3925.5545.9025.132406318.00600519
12001-08-285.4675.7595.7815.407129647.79600519
22001-08-295.7775.6845.7815.64053252.75600519
32001-08-305.6685.7965.8605.62448013.06600519
42001-08-315.8045.7825.8775.74923231.48600519
52001-09-035.8125.7795.8705.75722112.09600519
62001-09-045.7825.8525.9495.76237006.77600519
72001-09-055.8765.8495.9245.81326066.95600519
82001-09-065.8355.7345.8545.70428997.03600519
92001-09-075.7025.5745.7735.57031552.25600519
102001-09-105.5315.7345.7575.47018878.89600519
112001-09-115.7495.6705.7735.65611390.81600519
122001-09-125.5205.6215.6565.51525045.19600519
132001-09-135.6265.6075.6715.57710986.68600519
142001-09-145.6265.6715.7025.5937672.83600519
152001-09-175.6375.5995.6705.5468983.97600519
162001-09-185.6065.6635.7105.60110773.26600519
172001-09-195.6715.7685.7685.6348650.53600519
182001-09-205.7655.7205.7885.70211173.35600519
192001-09-215.6685.6345.7185.6247879.72600519
202001-09-245.6345.6325.6855.6244068.60600519
212001-09-255.6685.6605.7095.6323488.45600519
222001-09-265.6425.6375.6995.6244956.26600519
232001-09-275.6375.7345.7775.6248778.04600519
242001-09-285.7655.7955.8125.70217088.47600519
252001-10-085.7815.7155.8095.6636552.17600519
262001-10-095.7185.8275.8595.7189558.52600519
272001-10-105.8275.6405.8485.62917548.69600519
282001-10-115.6265.5855.7025.57012306.84600519
292001-10-125.6095.6245.7265.32020010.70600519
........................
41182018-12-07595.600587.000598.860584.70026841.00600519
41192018-12-10576.000577.230587.500574.18024640.00600519
41202018-12-11577.000582.990586.550577.00016316.00600519
41212018-12-12588.790589.000589.200582.80019921.00600519
41222018-12-13590.000601.750606.880589.60040399.00600519
41232018-12-14598.010592.960599.800592.00024554.00600519
41242018-12-17592.570590.520595.970585.01024895.00600519
41252018-12-18585.880590.940591.540585.00019222.00600519
41262018-12-19589.000577.800590.990576.03025715.00600519
41272018-12-20576.220574.500581.500568.89026940.00600519
41282018-12-21571.000567.410573.450562.21030640.00600519
41292018-12-24563.500568.000570.910558.00019109.00600519
41302018-12-25562.220565.790567.500553.61026592.00600519
41312018-12-26563.810560.080568.400558.00017866.00600519
41322018-12-27570.000563.000571.850562.22029493.00600519
41332018-12-28563.300590.010596.400560.00063678.00600519
41342019-01-02609.980598.980612.000595.01062286.00600519
41352019-01-03599.970590.000601.660585.80030977.00600519
41362019-01-04587.280602.000607.670582.02037683.00600519
41372019-01-07608.000605.490612.000602.22034750.00600519
41382019-01-08605.500604.790612.000600.28028838.00600519
41392019-01-09609.990616.120626.160609.04049892.00600519
41402019-01-10617.130618.770624.880610.25029043.00600519
41412019-01-11625.000635.880637.000618.75038884.00600519
41422019-01-14633.000624.600635.890621.55025648.00600519
41432019-01-15627.860659.980661.500625.00055764.00600519
41442019-01-16658.000661.100665.650653.60042083.00600519
41452019-01-17663.000659.250669.700655.80030284.00600519
41462019-01-18665.000683.610690.200661.50068609.00600519
41472019-01-21690.000686.980698.880681.88047051.00600519

4148 rows × 7 columns

In [15]:

df.head()

Out[15]:

dateopenclosehighlowvolumecode
02001-08-275.3925.5545.9025.132406318.00600519
12001-08-285.4675.7595.7815.407129647.79600519
22001-08-295.7775.6845.7815.64053252.75600519
32001-08-305.6685.7965.8605.62448013.06600519
42001-08-315.8045.7825.8775.74923231.48600519

In [20]:

df = pd.read_csv("600519.csv", index_col='date',parse_dates=['date'])[['open','close','high','low']]
df.head()

Out[20]:

openclosehighlow
date
2001-08-275.3925.5545.9025.132
2001-08-285.4675.7595.7815.407
2001-08-295.7775.6845.7815.640
2001-08-305.6685.7965.8605.624
2001-08-315.8045.7825.8775.749

输出该股票所有收盘比开盘上涨3%以上的日期

In [19]:

df[(df['close']-df['open'])/df['open']>=0.03].index

Out[19]:

DatetimeIndex(['2001-08-27', '2001-08-28', '2001-09-10', '2001-12-21',
               '2002-01-18', '2002-01-31', '2003-01-14', '2003-10-29',
               '2004-01-05', '2004-01-14',
               ...
               '2018-06-20', '2018-08-09', '2018-08-21', '2018-08-27',
               '2018-09-18', '2018-09-26', '2018-10-19', '2018-10-31',
               '2018-11-13', '2018-12-28'],
              dtype='datetime64[ns]', name='date', length=291, freq=None)

输出该股票所有开盘比前日收盘跌幅超过2%的日期

In [20]:

df[(df['open']-df['close'].shift(1))/df['close'].shift(1)<=-0.02].index

Out[20]:

DatetimeIndex(['2001-09-12', '2002-06-26', '2002-12-13', '2004-07-01',
               '2004-10-29', '2006-08-21', '2006-08-23', '2007-01-25',
               '2007-02-01', '2007-02-06', '2007-03-19', '2007-05-21',
               '2007-05-30', '2007-06-05', '2007-07-27', '2007-09-05',
               '2007-09-10', '2008-03-13', '2008-03-17', '2008-03-25',
               '2008-03-27', '2008-04-22', '2008-04-23', '2008-04-29',
               '2008-05-13', '2008-06-10', '2008-06-13', '2008-06-24',
               '2008-06-27', '2008-08-11', '2008-08-19', '2008-09-23',
               '2008-10-10', '2008-10-15', '2008-10-16', '2008-10-20',
               '2008-10-23', '2008-10-27', '2008-11-06', '2008-11-12',
               '2008-11-20', '2008-11-21', '2008-12-02', '2009-02-27',
               '2009-03-25', '2009-08-13', '2010-04-26', '2010-04-30',
               '2011-08-05', '2012-03-27', '2012-08-10', '2012-11-22',
               '2012-12-04', '2012-12-24', '2013-01-16', '2013-01-25',
               '2013-09-02', '2014-04-25', '2015-01-19', '2015-05-25',
               '2015-07-03', '2015-07-08', '2015-07-13', '2015-08-24',
               '2015-09-02', '2015-09-15', '2017-11-17', '2018-02-06',
               '2018-02-09', '2018-03-23', '2018-03-28', '2018-07-11',
               '2018-10-11', '2018-10-24', '2018-10-25', '2018-10-29',
               '2018-10-30'],
              dtype='datetime64[ns]', name='date', freq=None)

假如我从2010年1月1日开始,每月第一个交易日买入1手股票,每年最后一个交易日卖出所有股票,到今天为止,我的收益如何?

In [21]:

df['open'][-1]

Out[21]:

690.0

In [22]:

price_last = df['open'][-1]
df = df['2010-01':'2019-01'] #剔除首尾无用的数据
df

Out[22]:

openclosehighlow
date
2010-01-04109.760108.446109.760108.044
2010-01-05109.116108.127109.441107.846
2010-01-06107.840106.417108.165106.129
2010-01-07106.417104.477106.691103.302
2010-01-08104.655103.379104.655102.167
2010-01-11104.400102.926105.230102.422
2010-01-12103.028105.708106.040102.492
2010-01-13104.649103.022105.389102.741
2010-01-14103.379107.552107.974103.379
2010-01-15107.533108.401110.641107.533
2010-01-18108.484109.110109.926108.420
2010-01-19109.116108.337109.441108.165
2010-01-20108.427105.881108.580105.804
2010-01-21105.842106.397107.450105.613
2010-01-22106.314104.738106.755103.309
2010-01-25104.560105.957106.761103.704
2010-01-26107.380106.378108.593105.038
2010-01-27105.951104.643107.068104.400
2010-01-28104.566107.974108.708104.336
2010-01-29108.452107.552108.612107.004
2010-02-01107.769107.776108.216106.576
2010-02-02107.208106.263108.484106.117
2010-02-03106.066105.887107.272104.783
2010-02-04105.868107.591108.006105.376
2010-02-05106.959109.282109.684106.570
2010-02-08109.282109.269112.058108.816
2010-02-09109.760109.193110.609108.822
2010-02-10109.760109.652110.137108.931
2010-02-11109.633110.641111.318109.505
2010-02-12111.018110.456111.164109.888
...............
2018-12-07595.600587.000598.860584.700
2018-12-10576.000577.230587.500574.180
2018-12-11577.000582.990586.550577.000
2018-12-12588.790589.000589.200582.800
2018-12-13590.000601.750606.880589.600
2018-12-14598.010592.960599.800592.000
2018-12-17592.570590.520595.970585.010
2018-12-18585.880590.940591.540585.000
2018-12-19589.000577.800590.990576.030
2018-12-20576.220574.500581.500568.890
2018-12-21571.000567.410573.450562.210
2018-12-24563.500568.000570.910558.000
2018-12-25562.220565.790567.500553.610
2018-12-26563.810560.080568.400558.000
2018-12-27570.000563.000571.850562.220
2018-12-28563.300590.010596.400560.000
2019-01-02609.980598.980612.000595.010
2019-01-03599.970590.000601.660585.800
2019-01-04587.280602.000607.670582.020
2019-01-07608.000605.490612.000602.220
2019-01-08605.500604.790612.000600.280
2019-01-09609.990616.120626.160609.040
2019-01-10617.130618.770624.880610.250
2019-01-11625.000635.880637.000618.750
2019-01-14633.000624.600635.890621.550
2019-01-15627.860659.980661.500625.000
2019-01-16658.000661.100665.650653.600
2019-01-17663.000659.250669.700655.800
2019-01-18665.000683.610690.200661.500
2019-01-21690.000686.980698.880681.880

2195 rows × 4 columns

In [28]:

df_monthly = df.resample("M").first()
df_monthly.head()

Out[28]:

openclosehighlow
date
2010-01-31109.760108.446109.760108.044
2010-02-28107.769107.776108.216106.576
2010-03-31106.219106.085106.857105.925
2010-04-30101.324102.141102.422101.311
2010-05-3181.67682.09182.67880.974

In [29]:

df_yearly = df.resample("A").last()[:-1] #去除最后一年
df_yearly

Out[29]:

openclosehighlow
date
2010-12-31117.103118.469118.701116.620
2011-12-31138.039138.468139.600136.105
2012-12-31155.208152.087156.292150.144
2013-12-3193.18896.48097.17992.061
2014-12-31157.642161.056161.379157.132
2015-12-31207.487207.458208.704207.106
2016-12-31317.239324.563325.670317.239
2017-12-31707.948687.725716.329681.918
2018-12-31563.300590.010596.400560.000

In [31]:

df_monthly.loc[str(year)]

Out[31]:

openclosehighlow
date
2018-01-31690.200693.996700.218680.232
2018-02-28756.262747.122756.558742.379
2018-03-31717.808731.582736.394713.637
2018-04-30670.480670.539681.326664.673
2018-05-31650.760658.480659.624636.029
2018-06-30740.614734.679744.410728.417
2018-07-31734.520711.550739.330703.000
2018-08-31731.400714.940732.300714.110
2018-09-30652.000666.210667.670650.800
2018-10-31715.410686.150719.000686.150
2018-11-30555.000563.000585.500551.250
2018-12-31589.000601.200605.000584.770

In [33]:

cost_money = 0
hold = 0 #每年持有的股票
for year in range(2010, 2020):
    
    cost_money -= df_monthly.loc[str(year)]['open'].sum()*100
    hold += len(df_monthly[str(year)]['open']) * 100
    if year != 2019:
        cost_money += df_yearly[str(year)]['open'][0] * hold
        hold = 0 #每年持有的股票
    #print(cost_money)

cost_money += hold * price_last

print(cost_money)
        
# df_monthly['2001']
289604.69999999984

In [56]:

df['2017']

Out[56]:

openclosehighlow
date
2017-01-03329.299329.575331.979327.851
2017-01-04329.634346.667346.923329.615
2017-01-05344.785341.574346.213340.293
2017-01-06341.475345.534354.419340.943
2017-01-09342.618343.317347.622341.377
2017-01-10343.258343.800346.755341.436
2017-01-11342.815340.303342.815338.382
2017-01-12341.386341.879342.224339.377
2017-01-13341.810339.731342.214338.756
2017-01-16339.003336.382339.663333.752
2017-01-17337.495343.928346.263336.904
2017-01-18343.682349.789351.454342.037
2017-01-19349.711349.435353.139346.460
2017-01-20349.612349.701352.075347.770
2017-01-23351.563345.987354.439344.253
2017-01-24346.036345.090347.543341.583
2017-01-25344.765342.450344.765341.042
2017-01-26342.953339.288343.603336.215
2017-02-03340.845341.682343.977339.860
2017-02-06343.298341.682343.692339.820
2017-02-07341.672338.461342.224338.027
2017-02-08338.638339.081340.352337.564
2017-02-09339.860342.431343.593339.475
2017-02-10343.426339.938344.608339.860
2017-02-13338.874344.539346.657336.825
2017-02-14344.332344.992347.346343.898
2017-02-15346.263343.337347.189342.027
2017-02-16343.307342.904343.426340.362
2017-02-17343.701345.337349.159342.322
2017-02-20345.189355.907356.921344.795
...............
2017-10-20581.000574.330581.000571.760
2017-10-23574.920573.410580.500568.000
2017-10-24572.000567.330573.800563.610
2017-10-25569.750565.670571.680562.060
2017-10-26605.000605.090613.000591.910
2017-10-27602.110649.630655.000600.030
2017-10-30643.050622.080644.000621.000
2017-10-31622.010618.030627.600611.880
2017-11-01620.880623.010631.540619.220
2017-11-02626.950626.920629.780621.000
2017-11-03628.650639.170643.440628.650
2017-11-06641.500653.060657.620640.200
2017-11-07650.580642.070657.150638.180
2017-11-08642.330650.380654.500642.330
2017-11-09648.000650.070654.150644.610
2017-11-10650.430677.950683.400650.430
2017-11-13680.000687.880692.780675.800
2017-11-14687.140678.750687.220677.350
2017-11-15678.750688.080689.990674.000
2017-11-16686.010719.110719.960686.010
2017-11-17696.000690.250709.000677.770
2017-11-20676.800679.150685.600660.000
2017-11-21669.000677.250688.800662.020
2017-11-22677.000650.520688.000650.000
2017-11-23642.000633.730646.000630.220
2017-11-24635.900630.040642.500622.230
2017-11-27634.000621.290636.330613.010
2017-11-28625.060648.230648.880624.000
2017-11-29649.940638.120651.970634.010
2017-11-30635.510631.000648.480626.000

223 rows × 4 columns