Python Pandas基础操作详解

				数据结构&Series：

									# Imports needed by this listing (the article's snippets use `np` and `pd`).
									import numpy as np
									import pandas as pd

									# A Series is an index plus data. When no index is passed,
									# a default integer index is generated automatically.

									# Build a Series from a list.
									s1 = pd.Series([1, 2, 3, 4, 5])

									# Build a Series from a NumPy array.
									arr1 = np.arange(10)
									s2 = pd.Series(arr1)

									# Same data with a custom label index.
									s2 = pd.Series(arr1, index=list('abcdefghij'))

									# Inspect the values and the index separately.
									print(s1.values)
									print(s1.index)

									# Build a Series from a dict. Labels listed in `index` that are missing
									# from the dict become NaN instead of raising (unlike an index whose
									# length does not match an array). Dicts keep insertion order on
									# Python 3.7+, so `index` is used here to select/extend the labels.
									dict1 = {'姓名': '李宁', '班级': '三班', '年龄': '22'}
									print(dict1)
									s3 = pd.Series(dict1, index=['姓名', '班级', '年龄', '性别'])

									# Which values are NaN?
									print(s3.isnull())

									# Which values are not NaN?
									print(s3.notnull())

									# Select by position. Use .iloc: an integer inside [] on a label-indexed
									# Series relies on a deprecated positional fallback.
									print(s3.iloc[1])

									# Select by label.
									print(s3['姓名'])

									# Select several positions at once.
									print(s2.iloc[[1, 5]])

									# Slicing.
									print(s2.iloc[1:4])         # positional slice: half-open, end excluded
									print(s2['b':'h'])          # label slice: the end label 'h' is included

									# Boolean-mask selection.
									print(s2[s2 > 5])

									# Arithmetic and filtering never break the index <-> value pairing.

									# The `name` attributes.
									s2.name = '佳林'          # name of the Series itself (title of the values)
									s2.index.name = '字母表'   # name of the index (title of the index)

									# First three rows.
									print(s2.head(3))

									# Last two rows.
									print(s2.tail(2))

				DataFrame的构建：

									# Build a DataFrame from a dict whose values are a list, a tuple and an array.
									data = {
									    'a': [1, 2, 3, 4],
									    'b': (5, 6, 7, 8),
									    'c': np.arange(9, 13),
									}
									frame = pd.DataFrame(data)

									# Row index.
									print(frame.index)

									# Column index.
									print(frame.columns)

									# Underlying values, returned as a 2-D NumPy array.
									print(frame.values)

									# Custom row index.
									frame = pd.DataFrame(data, index=['A', 'B', 'C', 'D'])

									# Custom row and column index; 'd' is not a key of `data`,
									# so that column is filled with NaN.
									frame = pd.DataFrame(data, index=['A', 'B', 'C', 'D'], columns=['a', 'b', 'c', 'd'])

									# Build from a dict of Series. The Series are aligned on their index,
									# so the shorter one is padded with NaN. Each column has a single dtype.
									pd1 = pd.DataFrame({
									    'a': pd.Series(np.arange(5)),
									    'b': pd.Series(np.arange(3, 5)),
									})

									# Build from a nested dict (dict of dicts):
									# the outer keys become the columns, the inner keys become the row index.
									data1 = {
									    'a': {
									        'apple': '3.6',
									        'banan': '3.5',
									    },
									    'b': {
									        'apple': '3.6',
									        'banan': '3.5',
									        'red': '3.7',
									        'yellow': '3.8',
									    },
									}
									# The original listing described this case but never built the frame.
									data1_frame = pd.DataFrame(data1)

									# Build from a 2-D array; index and columns default to 0..n-1.
									arr1 = np.arange(12).reshape(3, 4)
									print(arr1)
									frame1 = pd.DataFrame(arr1)

									# Build from a list of dicts; missing keys become NaN.
									li = [{'apple': '3.6', 'orange': '2.5'}, {'apple': '4.8', 'orange': '2.8'}, {'apple': '2.4'}]
									li_data = pd.DataFrame(li)

									# Build from a list of Series; each Series becomes one row.
									l2 = [pd.Series(np.random.rand(3)), pd.Series(np.random.rand(3))]
									l2_data = pd.DataFrame(l2)

				索引操作：

									# Sample objects: a Series with the default integer index and a
									# 3x3 frame labelled a-c (rows) and A-C (columns).
									ps = pd.Series(range(5))
									pd1 = pd.DataFrame(
									    np.arange(9).reshape(3, 3),
									    index=list('abc'),
									    columns=list('ABC'),
									)

									# reindex builds a NEW object that conforms to the requested labels.
									# None of the new labels exist in `ps`, so every value comes back NaN.
									ps2 = ps.reindex(index=list('abcde'))
									print(ps2)

									# Reindex the rows of a DataFrame: 'd' is new, so that row is all NaN.
									pd2 = pd1.reindex(index=list('abcd'))

									# Reindex (reorder / repeat) the columns.
									pd3 = pd1.reindex(columns=list('BCAB'))

				DataFrame基本操作：

									# Fixed seed so the random sample frame is reproducible.
									np.random.seed(1)
									pd1 = pd.DataFrame(
									    np.random.randint(0, 10, size=(3, 5)),
									    columns=['a', 'b', 'c', 'd', 'e'],
									    index=['A', 'B', 'C'],
									)
									print(pd1)

									# Like NumPy arrays, a frame can be transposed (pd1.T) and sliced.
									# A slice inside [] selects rows, while a single label inside []
									# selects a column. Prefer one .loc call over chained [][] indexing:
									# rows up to and including 'B', column 'e'.
									print(pd1.loc[:'B', 'e'])

									# Add a column.
									pd1['f'] = [5, 5, 5]
									print(pd1)

									# Delete a column (`del` is a statement, no call parentheses needed).
									del pd1['d']
									print(pd1)

									#修改行索引名----只能赋值

									1\直接赋值法

									pd1.index = ['a', 'b'........]

									2\自定义函数法

									def test_map(x):

									    return x+'_ABC'

									pd1.rename(index=test_map,inplace=True)

									#修改列索引名

									1\直接赋值

									pd1.columns = []

									2\用str进行广播操作 如整体去掉某符号

									pd1.columns = pd1.columns.str.strip('$')

									3\函数法

									pd1.columns = pd1.columns.map(lambda x:x[1:])

									4\rename属性

									# Mapping form: each dict key (old name) is replaced by its value (new name).
									# Only the listed columns are touched, so a subset can be renamed.
									# (The original opened the dict with "(" instead of "{", a syntax error.)
									df.rename(columns={'$a': 'a', '$b': 'b', '$c': 'c', '$d': 'd', '$e': 'e'}, inplace=True)

									# Function form: the callable is applied to every column name.
									df.rename(columns=lambda x: x.replace('$', ''), inplace=True)

									# Selecting rows and columns with loc and iloc:
									#   iloc selects by integer position,
									#   loc selects by label.

									df.loc[:, 'a']                    # column 'a'

									df.loc[:, ['a', 'c']]             # columns 'a' and 'c'

									df.loc[1]                         # the row whose label is 1

									df.iloc[1]                        # the row at position 1, i.e. the second row

									df.loc[:2]                        # several rows: from the start through label 2

									# loc slices are NOT half-open: the end label is included.

									df.loc[0:1, 'b']                  # rows labelled 0 through 1 (1 included), column 'b'

									df1.loc['a':'B', 'c':'d']         # a label range of rows and of columns

									# Several conditions combined with AND: wrap each comparison in
									# parentheses and join them with `&`.

									df[(df['a']<=2) & (df['b']>=5)]

									df.loc[(df['a']<=2) & (df['b']>=5)]

									# For an OR condition use `|`; the `or` keyword cannot be used here.

									df[(df['a']<=2) | (df['b']>=5)]

									df.loc[(df['a']<=2) | (df['b']>=5)]

				广播运算：

									# NumPy broadcasting: subtract the first row from every row.
									arr = np.arange(12).reshape(3, 4)
									print(arr)
									print(arr - arr[0])

									# DataFrame - Series: by default the Series index is matched against
									# the DataFrame's column labels, and the operation is repeated per row.
									df1 = pd.DataFrame(
									    np.arange(12).reshape(4, 3),
									    index=['a', 'b', 'c', 'd'],
									    columns=list('ABC'),
									)
									s3 = df1.iloc[0]        # first row, indexed by the column labels
									print(s3)
									print(df1 - s3)

									# Broadcasting down the columns instead: take a column (indexed by the
									# row labels) and match it on the row index with axis=0.
									# The original listing used `s4` without ever defining it.
									s4 = df1['A']
									print(df1.sub(s4, axis=0))

				索引增删改查：

									# ---- Add ----

									# Series: assign to a label to set (or add) an element.
									ps[4] = 9
									print(ps)

									ps1 = pd.Series({'v': 's', 'f': 's'})
									# Series.append was removed in pandas 2.0; pd.concat does the same job
									# and likewise returns a new object without modifying the inputs.
									pss = pd.concat([ps, ps1])

									# DataFrame: add a column by assignment.
									df['d'] = [9, 8, 9]

									# Insert a column named 'M', filled with 1, at column position 0.
									df.insert(0, 'M', 1)

									# Add a row by label with loc.
									df.loc['q'] = 1

									row = {'M': 's', 'a': 'b', 'b': 'w', 'c': 'w', 'd': 8}
									# DataFrame.append was removed in pandas 2.0; wrap the row in a
									# one-row frame and concatenate. ignore_index=True discards the
									# existing row labels and generates a fresh 0..n-1 index.
									dfnew = pd.concat([df, pd.DataFrame([row])], ignore_index=True)

									# ---- Delete ----

									del ps[0]

									# On a DataFrame, `del` can only remove columns.
									del df['M']

									# drop removes labels along an axis and returns a new object.
									# Drop rows by label:
									print(df.drop(['S', 'W']))

									# Drop columns by naming the axis:
									print(df.drop(['a', 'c'], axis=1))

									ps = pd.Series(range(1, 5))

									# ---- Modify ----

									ps[0] = 888
									print(ps)

									# Overwrite a whole column. Use item assignment rather than `df.a = 6`:
									# attribute assignment only works for existing, identifier-like column
									# names and otherwise silently sets a plain Python attribute.
									df['a'] = 6

									# Overwrite a whole row.
									df.loc['S'] = 888

									# Overwrite a single cell.
									df.loc['D', 'b'] = 8848

				字符串元素处理：

									# Sample Series of e-mail strings with one missing value.
									data = pd.Series({
									    'a': 'aeac@qq.com',
									    'b': 'stevan@famil.com',
									    'c': 'asda@asd.com',
									    'd': np.nan,
									})
									print(data)

									# Missing-value mask.
									print(data.isna())

									# Substring search; the NaN entry stays NaN.
									print(data.str.contains('qq'))

									# Split each string on '@'.
									print(data.str.split(r'@'))

									# All matches of the pattern in each string.
									print(data.str.findall(r'@'))

									# First five characters of each string.
									print(data.str.slice(0, 5))

				输出结果：

									a         aeac@qq.com

									b    stevan@famil.com

									c        asda@asd.com

									d                 NaN

									dtype: object

									a    False

									b    False

									c    False

									d     True

									dtype: bool

									a     True

									b    False

									c    False

									d      NaN

									dtype: object

									a         [aeac, qq.com]

									b    [stevan, famil.com]

									c        [asda, asd.com]

									d                    NaN

									dtype: object

									a    [@]

									b    [@]

									c    [@]

									d    NaN

									dtype: object

									a    aeac@

									b    steva

									c    asda@

									d      NaN

									dtype: object

				数据规整：

									pd.merge(data1, data2, on='按照哪一列合并', how='left 或 right 或 outer 或 inner')

									pd.merge(df_obj5, df_obj6, how='outer', left_index=True, right_index=True)

									pd.merge(df_obj3, df_obj4, left_on='key', right_index=True)

									pd.concat([df1, df2], join='inner 或 outer', axis=1)

									stack()：行索引(index)在最外层，列索引(columns)变成内层索引，结果变成series

									外层索引为index内层索引变成columns--unstack()

									g = df1.groupby(by='fruit')

									for name,group in g:

									    print(name)

									    print('-'*30)

									    print(group)

									apple

									------------------------------

									   fruit color  price

									0  apple   red    8.5

									3  apple  cyan    7.8

									banana

									------------------------------

									    fruit   color  price

									1  banana  yellow    6.8

									4  banana    cyan    6.4

									orange

									------------------------------

									    fruit   color  price

									2  orange  yellow    5.6

									#利用字典来获取具体分组名的dataframe

									s = dict(list(df1.groupby(by='fruit')))

									s['apple']

									def diff(arr):
									    """Return the spread of ``arr``: its maximum minus its minimum."""
									    highest = arr.max()
									    lowest = arr.min()
									    return highest - lowest

									df1.groupby(by='fruit')['price'].agg(diff)

秒客网

Python Pandas基础操作详解

目录

数据结构&Series：

DataFrame的构建：

索引操作：

DataFrame基本操作：

广播运算：

索引增删改查：

字符串元素处理：

数据规整：

总结

相关文章