数字货币量化系统之利用 Pandas 处理时序数据

利用Pandas处理时序数据

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

1. 时间索引

1.1 Python时间数据的基本操作

from datetime import datetime

cur_year = 2018
cur_month = 9
cur_day = 27
cur_hour = 11
cur_min = 11
cur_sec = 15
cur_date = datetime(cur_year, cur_month, cur_day)
cur_date
datetime.datetime(2018, 9, 27, 0, 0)
cur_date_time = datetime(cur_year, cur_month, cur_day, cur_hour, cur_min, cur_sec)
cur_date_time
datetime.datetime(2018, 9, 27, 11, 11, 15)
type(cur_date_time)
datetime.datetime
cur_date_time.month
9
now = datetime.now()
print(now)
print('{}年{}月{}日'.format(now.year, now.month, now.day))
2019-12-03 00:57:24.380933
2019年12月3日
diff = datetime.now() - datetime(2018, 1, 1)
print(type(diff))
print(diff)
print('经历了{}天, {}秒。'.format(diff.days, diff.seconds))
<class 'datetime.timedelta'>
701 days, 0:57:24.388962
经历了701天, 3444秒。

1.2 字符串和datetime转换

# str()
dt_obj = datetime.now()
str_obj = str(dt_obj)
print(type(str_obj))
print(str_obj)
<class 'str'>
2019-12-03 00:57:24.398306
# datetime.strftime()
str_obj2 = dt_obj.strftime('%d/%m/%Y')
print(str_obj2)
03/12/2019
# strptime
dt_str = '2018-01-1'
dt_obj2 = datetime.strptime(dt_str, '%Y-%m-%d')
print(type(dt_obj2))
print(dt_obj2)
<class 'datetime.datetime'>
2018-01-01 00:00:00

1.3 Pandas时间索引操作

date_list = [datetime(2018, 9, 1), datetime(2018, 9, 2), datetime(2018, 9, 3) ]
dt_idx = pd.DatetimeIndex(date_list)
dt_idx
DatetimeIndex(['2018-09-01', '2018-09-02', '2018-09-03'], dtype='datetime64[ns]', freq=None)
data = np.random.rand(3, 2)
df = pd.DataFrame(data, index=dt_idx, columns=['a', 'b'])
df
a b
2018-09-01 0.414071 0.950360
2018-09-02 0.593607 0.719602
2018-09-03 0.272688 0.092864
# 传入开始、结束日期,默认生成的该时间段的时间点是按天计算的
date_index = pd.date_range('2018/01/01', '2018/01/10')
print(date_index)
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08',
               '2018-01-09', '2018-01-10'],
              dtype='datetime64[ns]', freq='D')
# 只传入开始或结束日期,还需要传入时间段
print(pd.date_range(start='2018/01/01', periods=10, freq='4D'))
DatetimeIndex(['2018-01-01', '2018-01-05', '2018-01-09', '2018-01-13',
               '2018-01-17', '2018-01-21', '2018-01-25', '2018-01-29',
               '2018-02-02', '2018-02-06'],
              dtype='datetime64[ns]', freq='4D')
print(pd.date_range(end='2018/03/31', periods=10))
DatetimeIndex(['2018-03-22', '2018-03-23', '2018-03-24', '2018-03-25',
               '2018-03-26', '2018-03-27', '2018-03-28', '2018-03-29',
               '2018-03-30', '2018-03-31'],
              dtype='datetime64[ns]', freq='D')

2. 重采样

df = pd.read_csv('./btc_usdt_day_price.csv')
df.head()
datetime open high low close vol symbol
0 2017-10-10 16:00:00 1.00 4901.00 1.00 4901.00 19.260000 BTC/USDT
1 2017-10-11 16:00:00 4901.00 4999.00 4790.00 4989.00 0.580096 BTC/USDT
2 2017-10-12 16:00:00 4989.00 5922.00 4989.00 5741.70 20.739843 BTC/USDT
3 2017-10-13 16:00:00 5741.70 5849.95 5473.77 5849.95 19.259580 BTC/USDT
4 2017-10-14 16:00:00 5849.95 5849.95 5686.41 5848.00 8.658607 BTC/USDT
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 783 entries, 0 to 782
Data columns (total 7 columns):
datetime    783 non-null object
open        783 non-null float64
high        783 non-null float64
low         783 non-null float64
close       783 non-null float64
vol         783 non-null float64
symbol      783 non-null object
dtypes: float64(5), object(2)
memory usage: 42.9+ KB
# 将时间字符串转为时间索引
df['datetime'] = pd.to_datetime(df['datetime'])
df.head()
datetime open high low close vol symbol
0 2017-10-10 16:00:00 1.00 4901.00 1.00 4901.00 19.260000 BTC/USDT
1 2017-10-11 16:00:00 4901.00 4999.00 4790.00 4989.00 0.580096 BTC/USDT
2 2017-10-12 16:00:00 4989.00 5922.00 4989.00 5741.70 20.739843 BTC/USDT
3 2017-10-13 16:00:00 5741.70 5849.95 5473.77 5849.95 19.259580 BTC/USDT
4 2017-10-14 16:00:00 5849.95 5849.95 5686.41 5848.00 8.658607 BTC/USDT
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 783 entries, 0 to 782
Data columns (total 7 columns):
datetime    783 non-null datetime64[ns]
open        783 non-null float64
high        783 non-null float64
low         783 non-null float64
close       783 non-null float64
vol         783 non-null float64
symbol      783 non-null object
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 42.9+ KB
# 替换索引为时间
df.set_index('datetime', inplace=True)
df.head()
open high low close vol symbol
datetime
2017-10-10 16:00:00 1.00 4901.00 1.00 4901.00 19.260000 BTC/USDT
2017-10-11 16:00:00 4901.00 4999.00 4790.00 4989.00 0.580096 BTC/USDT
2017-10-12 16:00:00 4989.00 5922.00 4989.00 5741.70 20.739843 BTC/USDT
2017-10-13 16:00:00 5741.70 5849.95 5473.77 5849.95 19.259580 BTC/USDT
2017-10-14 16:00:00 5849.95 5849.95 5686.41 5848.00 8.658607 BTC/USDT
# 在加载数据的时候直接指定索引,并对时间字符串改为datetime
df2 = pd.read_csv('./btc_usdt_day_price.csv', index_col='datetime', parse_dates=True)
df2.index
DatetimeIndex(['2017-10-10 16:00:00', '2017-10-11 16:00:00',
               '2017-10-12 16:00:00', '2017-10-13 16:00:00',
               '2017-10-14 16:00:00', '2017-10-15 16:00:00',
               '2017-10-16 16:00:00', '2017-10-17 16:00:00',
               '2017-10-18 16:00:00', '2017-10-19 16:00:00',
               ...
               '2019-11-22 16:00:00', '2019-11-23 16:00:00',
               '2019-11-24 16:00:00', '2019-11-25 16:00:00',
               '2019-11-26 16:00:00', '2019-11-27 16:00:00',
               '2019-11-28 16:00:00', '2019-11-29 16:00:00',
               '2019-11-30 16:00:00', '2019-12-01 16:00:00'],
              dtype='datetime64[ns]', name='datetime', length=783, freq=None)

重采样

file

# 重采样
df.resample(rule='A').mean()
open high low close vol
datetime
2017-12-31 9730.795471 10361.618705 9193.040764 9889.569587 7660.869604
2018-12-31 7533.079686 7766.355870 7252.600444 7507.197731 37016.517370
2019-12-31 7363.794578 7563.743975 7166.226017 7374.493539 59212.948900
# 季度均值
df.resample(rule='Q').mean()
open high low close vol
datetime
2017-12-31 9730.795471 10361.618705 9193.040764 9889.569587 7660.869604
2018-03-31 10426.336880 10913.220420 9813.952741 10352.189024 51095.669267
2018-06-30 7734.802413 7931.319060 7505.682956 7733.718638 30939.826391
2018-09-30 6797.420514 6945.139676 6641.490180 6799.922108 31890.588984
2018-12-31 5238.848471 5345.947500 5107.708367 5207.270975 34380.002370
2019-03-31 3752.757594 3808.353684 3702.755729 3757.914841 44113.964987
2019-06-30 7255.041758 7528.605495 7049.604396 7323.457143 116866.322554
2019-09-30 10353.392391 10622.878261 10021.864130 10331.159783 38738.919921
2019-12-31 8329.066129 8527.330645 8127.616129 8311.962903 26891.371736
def first_day(entry):
    return entry[0]

df.resample('A').apply(first_day)
open high low close vol symbol
datetime
2017-12-31 1.0000 4901.0000 1.0000 4901.0000 19.260000 BTC/USDT
2018-12-31 13167.6301 13907.7498 12976.6766 13652.8499 23734.687134 BTC/USDT
2019-12-31 3682.0813 3843.4454 3638.0101 3802.6414 41686.944845 BTC/USDT
# 在pycharm,需要调用show才能展示
# plt.show(df['close'].resample('Q').mean().plot(kind='bar', figsize=(8, 5)))

df['close'].resample('Q').mean().plot(kind='bar', figsize=(8, 5))

file

df['close'].resample('W').mean().plot(kind='line', figsize=(8, 5))

file

3. 平移操作

df.head()
open high low close vol symbol
datetime
2017-10-10 16:00:00 1.00 4901.00 1.00 4901.00 19.260000 BTC/USDT
2017-10-11 16:00:00 4901.00 4999.00 4790.00 4989.00 0.580096 BTC/USDT
2017-10-12 16:00:00 4989.00 5922.00 4989.00 5741.70 20.739843 BTC/USDT
2017-10-13 16:00:00 5741.70 5849.95 5473.77 5849.95 19.259580 BTC/USDT
2017-10-14 16:00:00 5849.95 5849.95 5686.41 5848.00 8.658607 BTC/USDT
# 向下移动
df.shift(periods=1).head()
open high low close vol symbol
datetime
2017-10-10 16:00:00 NaN NaN NaN NaN NaN NaN
2017-10-11 16:00:00 1.0 4901.00 1.00 4901.00 19.260000 BTC/USDT
2017-10-12 16:00:00 4901.0 4999.00 4790.00 4989.00 0.580096 BTC/USDT
2017-10-13 16:00:00 4989.0 5922.00 4989.00 5741.70 20.739843 BTC/USDT
2017-10-14 16:00:00 5741.7 5849.95 5473.77 5849.95 19.259580 BTC/USDT
# 向上移动
df.shift(periods=-1).head()
open high low close vol symbol
datetime
2017-10-10 16:00:00 4901.00 4999.00 4790.00 4989.00 0.580096 BTC/USDT
2017-10-11 16:00:00 4989.00 5922.00 4989.00 5741.70 20.739843 BTC/USDT
2017-10-12 16:00:00 5741.70 5849.95 5473.77 5849.95 19.259580 BTC/USDT
2017-10-13 16:00:00 5849.95 5849.95 5686.41 5848.00 8.658607 BTC/USDT
2017-10-14 16:00:00 5848.00 5848.00 5848.00 5848.00 0.000000 BTC/USDT
df.tshift(freq='W').head()
open high low close vol symbol
datetime
2017-10-15 16:00:00 1.00 4901.00 1.00 4901.00 19.260000 BTC/USDT
2017-10-15 16:00:00 4901.00 4999.00 4790.00 4989.00 0.580096 BTC/USDT
2017-10-15 16:00:00 4989.00 5922.00 4989.00 5741.70 20.739843 BTC/USDT
2017-10-15 16:00:00 5741.70 5849.95 5473.77 5849.95 19.259580 BTC/USDT
2017-10-15 16:00:00 5849.95 5849.95 5686.41 5848.00 8.658607 BTC/USDT

4. 滚动统计

df['close'].plot(figsize=(16, 6))

file

df['close'].rolling(window=7).mean().head(20)
datetime
2017-10-10 16:00:00            NaN
2017-10-11 16:00:00            NaN
2017-10-12 16:00:00            NaN
2017-10-13 16:00:00            NaN
2017-10-14 16:00:00            NaN
2017-10-15 16:00:00            NaN
2017-10-16 16:00:00    5539.432857
2017-10-17 16:00:00    5576.290000
2017-10-18 16:00:00    5667.861429
2017-10-19 16:00:00    5690.478571
2017-10-20 16:00:00    5719.061429
2017-10-21 16:00:00    5738.995714
2017-10-22 16:00:00    5716.424286
2017-10-23 16:00:00    5733.778571
2017-10-24 16:00:00    5761.780000
2017-10-25 16:00:00    5803.208571
2017-10-26 16:00:00    5788.918571
2017-10-27 16:00:00    5713.458571
2017-10-28 16:00:00    5700.952857
2017-10-29 16:00:00    5751.134286
Name: close, dtype: float64
df['close'].plot()
df['close'].rolling(window=7).mean().plot(figsize=(16, 6))

file

df['close'].plot()
df['close'].rolling(window=7, center=True).mean().plot(figsize=(16, 6))

file

df['close_MA5'] = df['close'].rolling(window=5, center=True).mean()
df['close_MA10'] = df['close'].rolling(window=10, center=True).mean()
df['close_MA30'] = df['close'].rolling(window=30, center=True).mean()
df.head()
open high low close vol symbol close_MA5 close_MA10 close_MA30
datetime
2017-10-10 16:00:00 1.00 4901.00 1.00 4901.00 19.260000 BTC/USDT NaN NaN NaN
2017-10-11 16:00:00 4901.00 4999.00 4790.00 4989.00 0.580096 BTC/USDT NaN NaN NaN
2017-10-12 16:00:00 4989.00 5922.00 4989.00 5741.70 20.739843 BTC/USDT 5465.930 NaN NaN
2017-10-13 16:00:00 5741.70 5849.95 5473.77 5849.95 19.259580 BTC/USDT 5655.330 NaN NaN
2017-10-14 16:00:00 5849.95 5849.95 5686.41 5848.00 8.658607 BTC/USDT 5777.206 NaN NaN
df[['close', 'close_MA5', 'close_MA10', 'close_MA30']].plot(figsize=(16, 6))

file

完整代码

bitLearn/pandas_time_series.py

# -*- coding:utf-8 -*-

"""
@author: Corwien
@file: pandas_time_series.py
@time: 19/12/2 22:24
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 如果数据多,会自动隐藏,这里设置不自动隐藏
pd.set_option('expand_frame_repr', False)

# 1、时间索引
# 1.1 Python时间数据的基本操作
from datetime import datetime

cur_year = 2019
cur_month = 12
cur_day = 1
cur_hour = 14
cur_min = 11
cur_sec = 55

cur_date = datetime(cur_year, cur_month, cur_day)
print(cur_date)

cur_date_time = datetime(cur_year, cur_month, cur_day, cur_hour, cur_min, cur_sec)
print(cur_date_time)

this_month = cur_date_time.month
print(this_month)

# 获取当前时间
now = datetime.now()
print(now)
print('{}年{}月{}日'.format(now.year, now.month, now.day))

# 计算时间差
diff = datetime.now() - datetime(2018, 5, 23)
print(type(diff))
print(diff)
print('经历了{}天,{}秒。'.format(diff.days, diff.seconds))

# 1.2 字符串和datetime转换
dt_obj = datetime.now()
str_obj = str(dt_obj)
print(type(str_obj))
print(str_obj)

# datetime.strftime()
str_obj2 = dt_obj.strftime('%d/%m/%Y')
print(str_obj2)

# strptime
dt_str = '2019-02-14 00:00:00'
dt_obj2 = datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S')
print(type(dt_obj2))
print(dt_obj2)

# 1.3 Pandas时间索引操作
date_list = [datetime(2018,9,1), datetime(2018,9,2), datetime(2018,9,3)]
dt_idx = pd.DatetimeIndex(date_list)
print(dt_idx)

data = np.random.rand(3, 2)
df = pd.DataFrame(data, index=dt_idx, columns=['a', 'b'])
print(df)

# 传入开始、结束日期,默认生成的该时间段的时间点是按天计算的
date_index = pd.date_range('2019/09/01', '2019/09/10')
print(date_index)

# 只传入开始或结束日期,还需要传入时间段
print(pd.date_range(start='2019/11/15', periods=10, freq='2D'))

print(pd.date_range(end='2019/12/01', periods=10))

print('========= 2.重采样 ======\n')
# 2.重采样
df = pd.read_csv('./btc_sample.csv')
head_5 = df.head(5)
print(head_5)

df.info()

# 将字符串转为时间索引
df['datetime'] = pd.to_datetime(df['datetime'])
print(df.head())
print(df.info())

df.set_index('datetime', inplace=True)
print(df.head(5))

# 在加载数据的时候直接指定索引,并对时间字符串改为datetime
df2 = pd.read_csv('./btc_sample.csv', index_col='datetime', parse_dates=True)
print(df2.index)

# 重采样
res_mean = df.resample(rule='A').mean()
print(res_mean)

# 季度均值
res_mean_q = df.resample(rule='Q').mean()
print(res_mean_q)

def first_day(entry):
    return entry[0]

print(df.resample('A').apply(first_day))

# 画图,直方图
# df['close'].resample('Q').mean().plot(kind='bar', figsize=(8, 5)) # 不显示图片
# 在pycharm,需要调用show才能展示
# plt.show(df['close'].resample('Q').mean().plot(kind='bar', figsize=(8, 5)))

# K线图
# plt.show(df['close'].resample('W').mean().plot(kind='line', figsize=(8,5)))

# 平移操作
print(df.head())
print(df.shift(periods=1).head())

# 4、滚动统计
# plt.show(df['close'].plot(figsize=(18, 6)))

# 7日均线(这种写法可以使两个K线图画在一起,方便对比)
print(df['close'].rolling(window=7).mean().head(20))
#df['close'].plot(label='每日收盘价')
#df['close'].rolling(window=7).mean().plot(figsize=(16, 6), label='7日均价')
#plt.legend()  # 显示上面的label
#plt.title('BTC ROLLING TEST')  # 添加标题
#plt.show()

# df['close'].plot()
# df['close'].rolling(window=7, center=True).mean().plot(figsize=(16, 6))
# plt.show()

# 均线图
df['close_MA5'] = df['close'].rolling(window=5, center=True).mean()
df['close_MA10'] = df['close'].rolling(window=10, center=True).mean()
df['close_MA30'] = df['close'].rolling(window=30, center=True).mean()

df[['close', 'close_MA5', 'close_MA10', 'close_MA30']].plot(figsize=(16, 6))
plt.show()

为者常成,行者常至