Pandas
关键词:数据分析库
1.数据结构
import numpy as np
import pandas as pd
def main():
    """Demonstrate the two core pandas containers: Series and DataFrame.

    Builds a Series from a list, a DataFrame from a random NumPy array with
    a date index, and a DataFrame from a dict of mixed-type columns, and
    prints each of them. Returns None.
    """
    # Series built from a plain Python list (2, 4, ..., 20).
    s = pd.Series([i * 2 for i in range(1, 11)])
    print(type(s))

    # Eight dates starting at 2017-03-01, used as the row index.
    dates = pd.date_range("20170301", periods=8)
    df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))
    print(df)

    # DataFrame built from a dict: each key becomes a column. The scalar
    # entries ("A", "B") are combined with the 4-element columns.
    df = pd.DataFrame(
        {
            "A": 1,
            "B": pd.Timestamp("20170301"),
            "C": pd.Series(1, index=list(range(4)), dtype="float32"),
            "D": np.array([3] * 4, dtype="float32"),
            "E": pd.Categorical(["police", "student", "teacher", "doctor"]),
        }
    )
    print(df)


if __name__ == '__main__':
    main()
2.基本操作
import numpy as np
import pandas as pd
def main():
    """Demonstrate basic DataFrame operations: inspection, sorting,
    selection (label-, position- and mask-based), assignment and copying.

    All results are printed; the function returns None.
    """
    # Data structure
    s = pd.Series([i * 2 for i in range(1, 11)])
    print(type(s))
    dates = pd.date_range("20170301", periods=8)
    df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))
    print(df)

    # Inspection
    print(df.head(3))  # first three rows
    print(df.tail(3))  # last three rows
    print(df.index)
    print(df.values)
    print(df.T)  # transpose

    print("sort=")
    # Sort by the values of column "C", descending.
    print(df.sort_values(by='C', ascending=False))
    # Sort by the index, descending.
    print(df.sort_index(ascending=False))
    # Summary statistics of each column.
    print(df.describe())

    # Slicing / selection
    print(type(df["A"]))
    print(df[:3])
    # Use .loc for label-based row slicing so the intent (rows, not columns)
    # is explicit.
    print(df.loc["20170301":"20170305"])
    print(df.loc[dates[0]])
    print(df.loc["20170301":"20170304", ["B", "D"]])
    print(df.at[dates[0], "C"])
    print(df.iloc[1:3, 2:4])
    print(df.iat[1, 4])
    # Combine both conditions into one mask. Chaining df[mask1][mask2]
    # applies a full-length mask to an already filtered frame, which makes
    # pandas reindex the mask and emit a warning.
    print(df[(df.B > 0) & (df.A < 0)])
    print(df[df > 0])
    print(df[df["E"].isin([1, 2])])

    # Assignment
    s1 = pd.Series(list(range(10, 18)), index=pd.date_range("20170301", periods=8))
    df["F"] = s1  # new column, aligned on the date index
    print(df)
    df.at[dates[0], "A"] = 0
    print(df)
    df.iat[1, 1] = 1
    df.loc[:, "D"] = np.array([4] * len(df))
    print(df)

    # Copy: modify the copy so the original frame is left untouched.
    df2 = df.copy()
    df2[df2 > 0] = -df2  # flip every positive entry to its negative


if __name__ == '__main__':
    main()
3.缺失值处理
import numpy as np
import pandas as pd
def main():
    """Demonstrate how missing values arise when reindexing a DataFrame.

    Reindexes a random frame to its first four rows and to columns
    A-D plus a new column "G" (which does not exist in the source frame),
    then fills "G" for the first two dates. Returns None.
    """
    dates = pd.date_range("20170301", periods=8)
    df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))
    print(df)

    # The keyword is `columns` (plural); `column=` is not accepted by
    # DataFrame.reindex and raises TypeError.
    df1 = df.reindex(index=dates[:4], columns=list('ABCD') + ["G"])
    # Label-based slice: set "G" to 1 for the rows dates[0] through dates[1].
    df1.loc[dates[0]:dates[1], "G"] = 1


if __name__ == '__main__':
    main()
4.数据融合和形状定义
import numpy as np
import pandas as pd
def main():
    """Demonstrate basic statistics, Series shifting/differencing, and
    combining tables with concat and merge.

    All results are printed; the function returns None.
    """
    dates = pd.date_range("20170301", periods=8)
    df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))
    print(df.mean())  # mean of each column
    print(df.var())  # variance of each column

    s = pd.Series([1, 2, 2, np.nan, 5, 7, 9, 10], index=dates)
    print(s)
    print(s.shift(2))  # move values down by 2 positions, padding with NaN
    print(s.diff(2))  # difference between each value and the one 2 rows earlier
    print(s.value_counts())  # how many times each value occurs
    print(df.apply(np.cumsum))  # cumulative sum applied to each column

    # Table concatenation: stack the first three and last three rows.
    pieces = [df[:3], df[-3:]]
    print(pd.concat(pieces))

    left = pd.DataFrame({"key": ["x", "y"], "value": [1, 2]})
    right = pd.DataFrame({"key": ["x", "z"], "value": [3, 4]})
    print("left=")
    print(left)
    print("right=")
    print(right)
    # Left join on "key".
    print(pd.merge(left, right, on="key", how="left"))


if __name__ == '__main__':
    main()
5.时间序列,图形绘制,文件操作