< />

Pandas kisara

Pandas

关键词:数据分析库

1.数据结构

import numpy as np
import pandas as pd
def main():
    """Demonstrate the two core pandas containers, Series and DataFrame.

    Builds a Series from a list, a DataFrame from a random NumPy array
    indexed by dates, and a DataFrame from a dict of mixed-type columns,
    printing each result. Returns nothing.
    """
    # A Series holding the even numbers 2, 4, ..., 20.
    evens = pd.Series([2 * k for k in range(1, 11)])
    print(type(evens))

    # 8x5 frame of random samples, one row per day starting 2017-03-01.
    day_index = pd.date_range("20170301", periods=8)
    column_names = list("ABCDE")
    samples = np.random.randn(8, 5)
    frame = pd.DataFrame(samples, index=day_index, columns=column_names)
    print(frame)

    # A frame built column by column from a dict; each entry may be a
    # scalar, a Timestamp, a Series, an ndarray or a Categorical.
    mixed_columns = {
        "A": 1,
        "B": pd.Timestamp("20170301"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="float32"),
        "E": pd.Categorical(["police", "student", "teacher", "doctor"]),
    }
    frame = pd.DataFrame(mixed_columns)
    print(frame)
if __name__ == '__main__':
    main()

2.基本操作

import numpy as np
import pandas as pd
def main():
    """Walk through basic DataFrame operations and print each result.

    Covers inspection (head/tail/index/values/transpose), sorting,
    summary statistics, label- and position-based selection, boolean
    filtering, in-place assignment and copying. Returns nothing.
    """
    # Sample data: a small Series and an 8x5 random frame indexed by date.
    s = pd.Series([i * 2 for i in range(1, 11)])
    print(type(s))
    dates = pd.date_range("20170301", periods=8)
    df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))
    print(df)
    print(df.head(3))  # first three rows
    print(df.tail(3))  # last three rows
    print(df.index)
    print(df.values)
    print(df.T)  # transpose
    print("sort=")
    # Sort by the values of column C, descending.
    print(df.sort_values(by="C", ascending=False))
    # Sort by the index, descending.
    print(df.sort_index(ascending=False))
    # Summary statistics for every column.
    print(df.describe())

    # Selection / slicing.
    print(type(df["A"]))
    print(df[:3])
    print(df["20170301":"20170305"])
    print(df.loc[dates[0]])
    print(df.loc["20170301":"20170304", ["B", "D"]])
    print(df.at[dates[0], "C"])
    print(df.iloc[1:3, 2:4])
    print(df.iat[1, 4])
    # Combine both conditions into one mask. Chaining df[mask1][mask2]
    # applies a full-length mask to an already-filtered frame, which makes
    # pandas warn and silently reindex the second mask.
    print(df[(df.B > 0) & (df.A < 0)])
    print(df[df > 0])
    # Rows whose E value is exactly 1 or 2 (usually none for random floats).
    print(df[df["E"].isin([1, 2])])

    # Assignment.
    s1 = pd.Series(list(range(10, 18)), index=pd.date_range("20170301", periods=8))
    df["F"] = s1  # new column, aligned on the date index
    print(df)
    df.at[dates[0], "A"] = 0  # set a single cell by label
    print(df)
    df.iat[1, 1] = 1  # set a single cell by position
    df.loc[:, "D"] = np.array([4] * len(df))  # overwrite a whole column
    print(df)

    # Work on a copy so the original frame is left untouched:
    # flip the sign of every positive entry.
    df2 = df.copy()
    df2[df2 > 0] = -df2
if __name__ == '__main__':
    main()

3.缺失值处理

import numpy as np
import pandas as pd
def main():
    """Prepare a frame containing missing values for the missing-data demo.

    Prints an 8x5 random frame indexed by date, then derives a 4-row frame
    with columns A-D plus a new column G. G is absent from the source
    frame, so it starts out as NaN; only its first two rows are then
    filled with 1. Returns nothing.
    """
    dates = pd.date_range("20170301", periods=8)
    df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))
    print(df)
    # The keyword is `columns` (plural); `column=` is not accepted by
    # DataFrame.reindex and raises TypeError.
    df1 = df.reindex(index=dates[:4], columns=list("ABCD") + ["G"])
    # Label slices are inclusive, so this fills the first two rows of G.
    df1.loc[dates[0]:dates[1], "G"] = 1
if __name__ == '__main__':
    main()

4.数据融合和形状定义

import numpy as np
import pandas as pd
def main():
    """Show column statistics, Series shifting/differencing, concat and merge.

    Every intermediate result is printed; the function returns nothing.
    """
    day_index = pd.date_range("20170301", periods=8)
    frame = pd.DataFrame(
        np.random.randn(8, 5), index=day_index, columns=list("ABCDE")
    )
    print(frame.mean())  # per-column mean
    print(frame.var())  # per-column variance

    series = pd.Series([1, 2, 2, np.nan, 5, 7, 9, 10], index=day_index)
    series_views = (
        series,
        series.shift(2),  # values moved down two rows, gaps filled with NaN
        series.diff(2),  # difference between each row and the row two before
        series.value_counts(),  # how often each value occurs
    )
    for view in series_views:
        print(view)
    print(frame.apply(np.cumsum))

    # Stack the first three and last three rows into one table.
    first_rows = frame[:3]
    last_rows = frame[-3:]
    print(pd.concat([first_rows, last_rows]))

    left = pd.DataFrame({"key": ["x", "y"], "value": [1, 2]})
    right = pd.DataFrame({"key": ["x", "z"], "value": [3, 4]})
    for label, table in (("left=", left), ("right=", right)):
        print(label)
        print(table)
    # Left join on "key": every row of `left` is kept.
    joined = pd.merge(left, right, on="key", how="left")
    print(joined)
if __name__ == '__main__':
    main()

5.时间序列,图形绘制,文件操作

kisarawechat kisaraalipay