把日期和时间拆成多个特征

# 加载库
import pandas as pd

# 创建数据帧
df = pd.DataFrame()

# 创建 150 个日期(每周一个)
df['date'] = pd.date_range('1/1/2001', periods=150, freq='W')

# 为年、月、日、时、分创建特征
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute

# 展示三行
df.head(3)

   date        year  month  day  hour  minute
0  2001-01-07  2001  1      7    0     0
1  2001-01-14  2001  1      14   0     0
2  2001-01-21  2001  1      21   0     0

计算日期时间之间的差

# 加载库
import pandas as pd

# 创建数据帧
df = pd.DataFrame()

# 创建两个 datetime 特征
df['Arrived'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')]
df['Left'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-06-2017')]

# 计算特征之间的间隔
df['Left'] - df['Arrived']
'''
0   0 days
1   2 days
dtype: timedelta64[ns]
'''

# 计算特征之间的间隔(只保留天数)
pd.Series(delta.days for delta in (df['Left'] - df['Arrived']))
'''
0    0
1    2
dtype: int64
'''

将字符串转换为日期

# 加载库
import numpy as np
import pandas as pd

# 创建字符串
date_strings = np.array(['03-04-2005 11:35 PM', '23-05-2010 12:01 AM', '04-09-2009 09:09 PM'])

如果 errors="coerce",那么遇到任何问题时都不会抛出错误(抛出错误才是默认行为),而是将导致错误的值设置为 NaT(即缺失值)。

代码  描述  示例
%Y  整年  2001
%m  零填充的月份  04
%d  零填充的日期  09
%I  零填充的小时(12 小时)  02
%p  AM 或 PM  AM
%M  零填充的分钟  05
%S  零填充的秒钟  09

# 转换为 datetime
[pd.to_datetime(date, format="%d-%m-%Y %I:%M %p", errors="coerce") for date in date_strings]
'''
[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]
'''

转换 pandas 列的时区

# 加载库
import pandas as pd
from pytz import all_timezones

# 展示十个时区
all_timezones[0:10]
'''
['Africa/Abidjan', 'Africa/Accra', 'Africa/Addis_Ababa', 'Africa/Algiers',
 'Africa/Asmara', 'Africa/Asmera', 'Africa/Bamako', 'Africa/Bangui',
 'Africa/Banjul', 'Africa/Bissau']
'''

# 创建十个日期
dates = pd.Series(pd.date_range('2/2/2002', periods=10, freq='M'))

# 设置时区
dates_with_abidjan_time_zone = dates.dt.tz_localize('Africa/Abidjan')

# 查看 pandas 序列
dates_with_abidjan_time_zone
'''
0   2002-02-28 00:00:00+00:00
1   2002-03-31 00:00:00+00:00
2   2002-04-30 00:00:00+00:00
3   2002-05-31 00:00:00+00:00
4   2002-06-30 00:00:00+00:00
5   2002-07-31 00:00:00+00:00
6   2002-08-31 00:00:00+00:00
7   2002-09-30 00:00:00+00:00
8   2002-10-31 00:00:00+00:00
9   2002-11-30 00:00:00+00:00
dtype: datetime64[ns, Africa/Abidjan]
'''

# 转换时区
dates_with_london_time_zone = dates_with_abidjan_time_zone.dt.tz_convert('Europe/London')

# 查看 pandas 序列
dates_with_london_time_zone
'''
0   2002-02-28 00:00:00+00:00
1   2002-03-31 00:00:00+00:00
2   2002-04-30 01:00:00+01:00
3   2002-05-31 01:00:00+01:00
4   2002-06-30 01:00:00+01:00
5   2002-07-31 01:00:00+01:00
6   2002-08-31 01:00:00+01:00
7   2002-09-30 01:00:00+01:00
8   2002-10-31 00:00:00+00:00
9   2002-11-30 00:00:00+00:00
dtype: datetime64[ns, Europe/London]
'''

编码星期

# 加载库
import pandas as pd

# 创建数据集
dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq='M'))

# 查看数据
dates
'''
0   2002-02-28
1   2002-03-31
2   2002-04-30
dtype: datetime64[ns]
'''

# 查看星期
# 注:dt.weekday_name 已在 pandas 1.0 中移除,新版本请改用 dates.dt.day_name()
dates.dt.weekday_name
'''
0    Thursday
1      Sunday
2     Tuesday
dtype: object
'''

处理时间序列中的缺失值

# 加载库
import pandas as pd
import numpy as np

# 创建日期
time_index = pd.date_range('01/01/2010', periods=5, freq='M')

# 创建数据帧,设置索引
df = pd.DataFrame(index=time_index)

# 创建带有一些缺失值的特征
df['Sales'] = [1.0,2.0,np.nan,np.nan,5.0]

# 对缺失值执行插值
df.interpolate()

            Sales
2010-01-31  1.0
2010-02-28  2.0
2010-03-31  3.0
2010-04-30  4.0
2010-05-31  5.0

# 前向填充
df.ffill()

            Sales
2010-01-31  1.0
2010-02-28  2.0
2010-03-31  2.0
2010-04-30  2.0
2010-05-31  5.0

# 后向填充
df.bfill()

            Sales
2010-01-31  1.0
2010-02-28  2.0
2010-03-31  5.0
2010-04-30  5.0
2010-05-31  5.0

# 对缺失值执行插值(每段连续缺失值最多向前填充一个)
df.interpolate(limit=1, limit_direction='forward')

            Sales
2010-01-31  1.0
2010-02-28  2.0
2010-03-31  3.0
2010-04-30  NaN
2010-05-31  5.0

处理时区

# 加载库
import pandas as pd
from pytz import all_timezones

# 展示十个时区
all_timezones[0:10]
'''
['Africa/Abidjan', 'Africa/Accra', 'Africa/Addis_Ababa', 'Africa/Algiers',
 'Africa/Asmara', 'Africa/Asmera', 'Africa/Bamako', 'Africa/Bangui',
 'Africa/Banjul', 'Africa/Bissau']
'''

# 创建 datetime
pd.Timestamp('2017-05-01 06:00:00', tz='Europe/London')
# Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

# 创建 datetime
date = pd.Timestamp('2017-05-01 06:00:00')

# 设置时区
date_in_london = date.tz_localize('Europe/London')

# 修改时区
date_in_london.tz_convert('Africa/Abidjan')
# Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan')

平移时间特征

# 加载库
import pandas as pd

# 创建数据帧
df = pd.DataFrame()

# 创建数据
df['dates'] = pd.date_range('1/1/2001', periods=5, freq='D')
df['stock_price'] = [1.1,2.2,3.3,4.4,5.5]

# 将值平移一行
df['previous_days_stock_price'] = df['stock_price'].shift(1)

# 展示数据帧
df

   dates       stock_price  previous_days_stock_price
0  2001-01-01  1.1          NaN
1  2001-01-02  2.2          1.1
2  2001-01-03  3.3          2.2
3  2001-01-04  4.4          3.3
4  2001-01-05  5.5          4.4

滑动时间窗口

# 加载库
import pandas as pd

# 创建 datetime
time_index = pd.date_range('01/01/2010', periods=5, freq='M')

# 创建数据帧,设置索引
df = pd.DataFrame(index=time_index)

# 创建特征
df['Stock_Price'] = [1,2,3,4,5]

# 计算滑动均值
df.rolling(window=2).mean()

            Stock_Price
2010-01-31  NaN
2010-02-28  1.5
2010-03-31  2.5
2010-04-30  3.5
2010-05-31  4.5

# 识别滑动时间窗口中的最大值
df.rolling(window=2).max()

            Stock_Price
2010-01-31  NaN
2010-02-28  2.0
2010-03-31  3.0
2010-04-30  4.0
2010-05-31  5.0

选择日期时间范围

# 加载库
import pandas as pd

# 创建数据帧
df = pd.DataFrame()

# 创建 datetime
df['date'] = pd.date_range('1/1/2001', periods=100000, freq='H')

如果数据帧未按时间索引,请使用此方法。

# 选择两个日期时间之间的观测
df[(df['date'] > '2002-1-1 01:00:00') & (df['date'] <= '2002-1-1 04:00:00')]

      date
8762  2002-01-01 02:00:00
8763  2002-01-01 03:00:00
8764  2002-01-01 04:00:00

如果数据帧按时间索引,请使用此方法。

# 设置索引
df = df.set_index(df['date'])

# 选择两个日期时间之间的观测
df.loc['2002-1-1 01:00:00':'2002-1-1 04:00:00']

                     date
date
2002-01-01 01:00:00  2002-01-01 01:00:00
2002-01-01 02:00:00  2002-01-01 02:00:00
2002-01-01 03:00:00  2002-01-01 03:00:00
2002-01-01 04:00:00  2002-01-01 04:00:00
|