numpy、pandas

numpy：

仨属性：ndim-维度个数；shape-维度大小；dtype-数据类型。

pandas各方法的axis缺省多为0（沿行方向、逐列计算）；DataFrame的.sort_index()和.dropna()是例外，缺省的0作用于行标签/整行。numpy的统计函数如np.sum()、np.mean()的axis缺省为None，即对全部元素计算，需显式写axis=0才按列。

import numpy as np

相同值=np.ones((3,5),int) #同类：np.zeros()，np.empty()；首参shape用()或[]均可

转换类型=相同值.astype(np.float64) #转换行列=相同值.transpose()

随机整数=np.random.randint(-30,70,(5,6)) #size参数用()或[]如[5,6]都行

随机带小数=np.random.uniform(-5,9,(5,6)) #返回二维[[][]…]

带2=np.random.uniform(9,size=6) #一维[]，6个<9的带小数；参数无min时size=不能省

随机纯小数=np.random.rand(5,6) #标准正态分布=np.random.randn(5,6)

自然数=np.arange(10,40).reshape((5,6)) #1维np.arange()转2维np.array()，用.reshape((行,列))

自2=np.arange(10,40);np.random.shuffle(自2);自2=自2.reshape((5,6)) #shuffle洗牌不了多维

多种类=np.array([range(0,5)]*4+[list('abcde')]+[range(2,7)]) #range、[]各再套个[]就能用+

去重并排序=np.unique(多种类) #返回无重复元素的一维[]，像是OrderedSet

索引与切片=自然数[:,3:] #首行用[0]，首列用[:,0]；自然数[2,3]等价于自然数[2][3]

元素判断1=np.all(随机整数<10) #反义词np.any()

元素判断2=np.where(随机整数>0,'正数','负数')

元素计算=np.ceil(随机带小数) #地板floor()，四舍五入rint()，绝对值abs()，非数值isnan()

元素累计=np.cumsum(自然数,1) #二维可用axis，为0累下为1累右，无返1维；累乘cumprod()

#只写首参统计全维，返回1个int；2维若用了2参axis(列0行1)则按各列或各行统计，返回[]

矩阵统计=np.sum(自然数,0) #mean()､max()､min的索引argmin()､标准差std()､var()

多矩阵运算1=np.multiply(自然数,自2) #各矩阵的形状要相同；np.divide(*,*)

多矩阵运算2=自然数+自然数 #自然数-5；自然数*2；1/自然数

#多个条件用&或|连接，而非Python的and或or；每个条件都用()括起来

条件索引=随机整数[(自然数>=20) & (自然数%2==0)] #同上，各矩阵的行&列数要一致

****************************************分割线****************************************

pandasのSeries和DataFrame：

Series：

import string

import pandas as pd

# Ten values 10..19 labelled 'a'..'j'; a dict such as {2016:8,2017:5} is also accepted as data.
# `string` must be imported here: the index labels come from string.ascii_letters.
sr=pd.Series(range(10,20),index=list(string.ascii_letters[:10]))

sr.name='temp' #sr.index.name='年份'

单索引=sr[2] #或sr['c']；DataFrame中：某列用df['str']或df.str，某行为df.iloc[[int],]

切片=sr[2:5] #按标签切时如sr['c':'e']，包含终止索引

不连续索引=sr[[0,4,6]] #或sr[['a','e','g']]，注意是两对[ ]

首或尾=sr.tail(6) #.head()或.tail()内若没写数字，则默认取5条记录

sr[0:7].values[1]=666 #.values属性的值可变，索引列.index不可单个变，类似[]与()

元素计算=sr**2 #在元素层面自定义的map方法：sr.map(lambda m:'%0.4d' %(m**2))

sr[sr<15]=-sr*3+7 #对布尔索引为True的那些元素做各种运算

**********************分割线**********************

DataFrame：

三种索引——int型序号索引iloc、str型标签索引loc、混合索引ix（注意：ix自pandas 0.20起弃用，1.0起已移除，新代码请只用iloc与loc）。

序号切片行时如df.iloc[0:3,]，可简写为df[0:3]；标签loc的简写相反，只索引列有3种场景，非切片列的那俩即单列&不连续列的索引都可简写；另外切片时，df.loc[]和df.ix[]若用了标签，则行&列都包含终止。

import numpy as np,pandas as pd,string

data=np.arange(11,41).reshape((6,5))

index=pd.date_range('20170101',periods=6,freq='5D').strftime('%x')

columns=list(string.ascii_uppercase[:5])

df=pd.DataFrame(data,index,columns)

*******分割线*******

索引：

单行索引=df.iloc[[2],] #若里面没套[ ]，则是纵向输出此行

不连续的行索引=df.iloc[[0,2,3],]

行切片=df[0:4] #df.iloc[0:4,]的简写

单列索引=df['B'] #df.loc[:,'B']的简写

不连续列的索引=df[['A','D']] #df.loc[:,['A','D']]的简写

列切片=df.loc[:,'A':'D'] #标签索引的切片竟包含终止，位置索引的切片正常不含终止

行列索引1=df.iloc[[0,1,3],1:4] #除iloc切片行或loc非切片列外，其他场景均要写.iloc或.loc

# Mixed positional-row / label-column lookup. DataFrame.ix was removed in pandas 1.0,
# so the column labels are translated to positions and everything goes through .iloc
# (positional slices exclude the stop, so rows 0 and 1 are selected).
行列索引2=df.iloc[0:2,df.columns.get_indexer(['A','E'])]

*******分割线*******

计算：

df['W']=df.E+5 #del(df['B'])，被增、删的字段都用df['key']，不能用简写的df.key

df['m']=list('qwerty');df['m']=df['m'].str.upper()

多值替换为一值=df.replace([20,30,'Y'],8)

多值替换为同量=df.replace([11,22,'Q'],[9,9,6])

统计=df.describe() #数值型的那些列的8行统计信息

按值排序=df.sort_values('E',ascending=False) #若是按某行的值排序，则加axis=1

标签排序=df.sort_index(axis=1) #这次例外了，axis缺省的0值作用于行标签

元素计算=df.copy()*3;元素计算[df<26]=-df+55 #对布尔索引为True的那些元素做各种运算

元素计算の内置函数=df.cumsum(axis=1,skipna=False) #max､mean､median､cumprod…

自定义=df.applymap(lambda m:'%0.5d' %(-m**2 if len(set(str(m**2)))<3 else m*10))

df['F']=df['C'].map(lambda m:m*2) #map方法的范围窄，只用于Series对象

*******分割线*******

去重：

data2=np.array([range(0,5)]*3+[list('abcde')]+[range(2,7)]+[list('61834')])

df2=pd.DataFrame(data2,index,columns)

判断重复=df2.duplicated() #首次出现为False，再现为True，默认判断整行

字段B和E的值同时再现则删除=df2.drop_duplicates(['B','E'])

数据缺失为空：

# Four rows, three columns; two cells are missing (NaN).
df3=pd.DataFrame([list(range(3)),[1,2,np.nan],[8,np.nan,5],list('qwe')])

# Boolean frame marking the missing cells.
是否存在空数据=df3.isnull()

# Replace every missing cell with 100.
填充缺失=df3.fillna(100)

# axis must be given by keyword (positional axis is rejected by pandas 2.x).
# axis=1 drops every column that contains a missing value; the default axis=0 drops rows.
有缺失则丢弃整条=df3.dropna(axis=1)

# inplace=True mutates df3 itself and returns None.
df3.dropna(axis=1,inplace=True)

指定丢弃哪行哪列=df.drop(columns=['B','E']) #labels+axis的0是丢弃索引行

对齐运算：add,sub,mul,div用fill_value填充对方的缺位，都缺为NaN；运算符+-*/无填充

# np.int was removed in NumPy 1.24; the builtin int is the drop-in replacement.
# The digit strings are converted explicitly with astype(int).
dfX=pd.DataFrame(list('5678')).astype(int)

dfY=pd.DataFrame(np.ones([2,3],int))

# Where only one operand has a cell, the other side is treated as fill_value;
# where both are missing the result stays NaN.
对齐运算=dfX.add(dfY,fill_value=6)

*******分割线*******

DataFrame↔Series：

sr=df.stack() #DataFrame→Series，columns列标转为了内层index

df1=sr.unstack(0).reindex(list('DBAEC')) #…按行标自排；0转外层index为列标，同df.transpose()

注：Series有内外两层index才能用unstack()。而正常的单index的Series转为DataFrame：sr.to_frame().T或pd.DataFrame(sr).transpose()。.T是.transpose()的简写版。

双层索引：

df4=pd.DataFrame(data,index=pd.MultiIndex.from_product([list('py'),list('618')]))

# Sum the rows that share the same inner (level 1) index label.
# The `level` argument of sum() was removed in pandas 2.0; groupby(level=...) replaces it.
# NOTE(review): sort=False is meant to keep the old first-appearance group order — confirm.
根据索引的级别统计=df4.groupby(level=1,sort=False).sum()

# Sort by the inner row labels in descending order, then select outer label 'p'.
# The trailing comma makes 'p' a row label; sort_index arguments are keyword-only in pandas 2.x.
行的内层标签降序后再索引行外层标签p=df4.sort_index(axis=0,level=1,ascending=False).loc['p',]

索引行内层=df4.swaplevel().loc['8'] #无法索引内层，交换至外层才行，8是标签非序号，故套''

*******分割线*******

矩阵拼接：

merge：以某些列标为外键，左右拼接俩矩阵；how默认交集，另有outer,left,right

data5=np.arange(13,33).reshape((4,5))

df5=pd.DataFrame(data5,range(15,31,5),list('CDEST'))

左右连接1=pd.merge(df,df5,'outer',['D','E'],suffixes=(':左',':右'))

左右连接2=pd.merge(df,df5,'right',left_on='C',right_index=True,suffixes=(':L',':R'))

print(df,df5,左右连接2,sep='\n')

concatenate：无所谓外键，首参中的矩阵数≥2；pd中各矩阵的行&列数可不等，np不行

concatのnp=np.concatenate([data,data5]) #同列数可上下合并，同行数可用左右合的axis=1

# Side-by-side concat keeping only the row labels present in both frames (join='inner').
# The defaults, axis=0 with join='outer', stack vertically and keep every column.
# axis and join are keyword-only in pandas 2.x.
concatのpd=pd.concat([df,df5],axis=1,join='inner')

*******分割线*******

#.value_counts()频次统计，降序输出；Series一直支持，DataFrame自pandas 1.1起也支持（按整行组合计数）。首参normalize默认False为频次，频率用True

某列各值的频率的倒数3名并保留2位小数=df['E'].value_counts(True).sort_values()[:3].round(2)

数字列按某些列值如[df['H'].dtypes,list('中美')]，分组聚合并降序，str列只有数量.size()：

应用多个聚合：.agg(['describe','size',自定义函数])；行&列数一致：.transform([np.sum,lambda df:df-df.mean()])

import matplotlib.pyplot as plt

df['G']=list('分分分分组组');df['H']=list('甲乙甲乙丙甲')

# Mean of column E per (G, H) group, rounded to one decimal, largest first.
# Series.sort_values takes keyword-only arguments in pandas 2.x, so ascending is named.
r=df['E'].groupby([df['G'],df['H']]).mean().round(1).sort_values(ascending=False)

r.to_csv('E:/666.csv',encoding='utf-8-sig',mode='a+')

r.iloc[:10].plot.bar(rot=0) #可视化top10

plt.rcParams['font.sans-serif']=['simhei'];plt.show()

透视表：

values中多个int字段，不同的值汇总依据：{'x':len,'y':[numpy.mean,numpy.sum]}；len计数

# Read the first sheet with every column as text, so student IDs of 11+ digits are not turned into numbers.
# np.str was removed in NumPy 1.24; the builtin str is the drop-in replacement.
df=pd.read_excel('F:/学生信息.xlsx',0,dtype=str)

table=pd.pivot_table(df,['学号',],['教学点','班级'],['性别','专业'],[len,np.min],0,True)

query=table.query("教学点==['山南','林芝']") #仅行标签能用query

print(query)

table.to_excel('学籍信息.xlsx') #to_csv则外层index无分组效果

****************************************分割线****************************************

把矩阵顺时针旋转90°：

import numpy as np,pandas as pd

matrix=np.arange(1,31).reshape((5,6))

matrix=np.rot90(matrix,1,(1,0)) #顺时针转90°；3参默认的(0,1)是逆时针，得转3次

matrix=pd.DataFrame(matrix).applymap(lambda x:'%0.2d' %x)

print(matrix)

*******分割线*******

不用numpy和pandas：

def rotate_matrix(row,column):
    """Print a row x column matrix of natural numbers, then the matrix rotated 90 degrees clockwise.

    Cells are numbered 1..row*column in row-major order and rendered as
    zero-padded strings at least two characters wide. The two printouts are
    separated by a divider line. Nothing is returned.
    """
    matrix = [[format(y+x*column,'02') for y in range(1, column + 1)] for x in range(row)]
    for cells in matrix:
        print(' '.join(cells))
    print('*'*20,'分割线','*'*20,sep='')
    # Reversing the row order and transposing with zip() yields a clockwise quarter turn.
    for cells in zip(*matrix[::-1]):
        print(' '.join(cells))

rotate_matrix(7,3)

****************************************分割线****************************************

Egの网页表格导出为本地表格：

import pandas as pd,requests

# Fetch listing pages 1-9 and append selected table columns to a local CSV file.
for page in range(1,10):
    url=f'http://www.xicidaili.com/wt/{page}'
    html=requests.get(url,headers={'User-Agent':'Mozilla/5.0 Chrome/64'}).text
    # First table on the page, without its header row, keeping columns 1, 2, 4 and 5.
    df=pd.read_html(html)[0].iloc[1:,[1,2,4,5]]
    # Append mode, so each page adds rows to the same file.
    df.to_csv('代理.csv',header=False,index=False,mode='a+',encoding='utf-8-sig')

**********************分割线**********************

Egの捞月狗LOL的皮肤分析：

1、爬皮肤数据.py

import requests,re

from fake_useragent import UserAgent as ua

from pandas import DataFrame as df

url='http://www.laoyuegou.com/x/zh-cn/lol/lol/godrank.html?region=cn&area=1'

# Lazy filler: skips 1 to 600 arbitrary characters between two captured fields.
x='.{1,600}?'
# Capturing group for a run of CJK ideographs.
y='([一-龥]+)'

# One player entry with 11 capture groups. Raw f-strings keep \d as a regex escape,
# and adjacent literals replace the backslash continuation inside the string.
# re.S lets the filler span line breaks.
pattern=re.compile(
    rf'item1">(#\d+){x}server">{y}{x}em>(\d+){x}span>(\d+){x}span'
    rf'>(\d+){x}age">({x}%){x}score/({x}).png{x}dan">({x})<{x}"{y}"{x}"{y}"{x}"{y}"',
    re.S)

def downLoad():
    """Download five ranking pages for every region and append the parsed rows to 'LOL.csv'.

    Region links and names are extracted from the landing page at ``url``;
    each region page is parsed with the module-level ``pattern``.
    """
    r=requests.get(url,headers={'User-Agent':'Mozilla/5.0 Chrome/64 Safari/537'}).text
    # Each match is a (link, region name) pair.
    area=re.findall('cn-li.*?(http.*?)".*?([一-龥]{2,9})',r,re.S)
    # Build the user-agent source once instead of once per request.
    agents=ua()
    for link,name in area:
        players=[]
        for page in range(1,6):
            r=requests.get(link+f'&page={page}',headers={'User-Agent':agents.random})
            print(f'下载{name}战区的第{page}页')
            players.extend(pattern.findall(r.text))
        # One append per region keeps memory bounded to a single region's rows.
        df(players).to_csv('LOL.csv',header=False,index=False,mode='a+',encoding='utf-8-sig')

#downLoad()

*******分割线*******

2、作图分析

import pandas as pd,matplotlib.pyplot as plt

#%matplotlib inline #IPython环境下绘图时，使图片内嵌在交互窗口，而非弹出

plt.rcParams['font.sans-serif']=['SimHei'];plt.rcParams['axes.unicode_minus']=False

plt.style.use('ggplot') #风格设为仿R语言的ggplot库

plt.rcParams['axes.labelsize']=20 #轴标题的字体尺寸

plt.rcParams['xtick.labelsize']=9;plt.rcParams['ytick.labelsize']=9 #轴刻度的字体尺寸

plt.rcParams['legend.fontsize']=12;plt.rcParams['figure.figsize']=[15,6]

#pd.read_*：首参若为路径str则不能有中文，用open('路径str',encoding='*')可以有

df=pd.read_csv('E:/py/LOL.csv',encoding='utf8')

#pd.DataFrame().iterrows()：各(序号,Series)组成的generator

#for row in df.iterrows():print(row[1][0]) #row[1]是Series对象，纵向显示各行记录

#print(df.describe())

#df['位置'].value_counts().drop('--').plot(kind='bar',rot=45,color='r') #图表,轴坐标斜度,颜色

pd.concat([df[f'本命英雄{x}'] for x in range(1,4)]).value_counts()[:10].plot(kind='barh',rot=0)

plt.show() #玩家最常用的前10个本命英雄，其频次的水平条形图

****************************************分割线****************************************

matplotlib：

plt.savefig()写在.show()之前或不写.show()，可设置保存时裁白边。

裁白边前的分辨率plt.figure()，写在.plot()等绘图法之前，值为dpi*figsize中的(宽,高)英寸。

主体颜色c、散点的轮廓色edgecolors、俩折线图间的填充色facecolor：①'r､g､b或完整单词或无色的none'；②各值在[0,1]间的(r,g,b)；③scatter散点图的4参为list，值越大则红绿蓝越深，有3类：列表=range(len(x或y))则先浅后深、=y则y↑越深，=x则x→越深。

如下几例的共同代码：

from datetime import datetime as dt

from numpy.random import randint,choice,shuffle

import numpy as np,pandas as pd,matplotlib.pyplot as plt

plt.rcParams['font.sans-serif']=['SimHei'];plt.rcParams['axes.unicode_minus']=False

plt.figure(dpi=160,figsize=(8,5)) #参数也可写在plt.savefig(**kwargs)里

#plt.xkcd() #漫画风格，不支持中文标题；plt.rcdefaults()重置为正常风格

***************分割线***************

#字体尺寸：图表标题用fontsize，轴刻度用labelsize，轴标题随选用的方法而定

def Egの大杂烩1():
    """Draw demo figure '涂鸦1': markers, a filled band and a colour-graded scatter over shuffled dates.

    The figure is saved to 'E:/涂鸦.png' with tight bounding box and then shown.
    """
    plt.title('涂鸦1',fontsize=24)  # chart title and its font size
    # Axis titles; plt.rcParams['axes.labelsize']=16 is the global alternative.
    plt.xlabel('x轴',fontsize=16)
    plt.ylabel('y轴')
    # Tick-label size; plt.rcParams['xtick.labelsize']=9 is the global alternative.
    plt.tick_params(axis='both',labelsize=9)
    # To show only chosen ticks (an empty list hides them all):
    # plt.xticks([-np.pi,0,np.pi],['-π','0','π']); plt.yticks([3,7])
    # Visible range: the first two values bound x, the last two bound y.
    plt.axis([dt(2017,6,1,0,0),dt(2017,9,30,0,0),9,21])
    x=pd.date_range('2017-6-1',periods=9,freq='2W').to_pydatetime()
    shuffle(x)
    y=randint(10,18,9)
    # Format string: a colour letter plus a marker/line style, in either order.
    plt.plot(x,y,'og',linewidth=3)
    # Shade the area between y and a second random series.
    plt.fill_between(x,y,randint(15,21,9),facecolor='gray',alpha=0.2)
    # c=range(9) colours the points from light to dark along the Reds colormap.
    # Size, colour, marker and cmap are passed by keyword rather than by position.
    plt.scatter(x,y,s=60,c=range(9),marker=4,cmap=plt.cm.Reds,edgecolors=(0,0.6,1))
    plt.savefig('E:/涂鸦.png',bbox_inches='tight')
    plt.show()

#Egの大杂烩1()

def Egの大杂烩2():
    """Draw demo figure '涂鸦2': sine and cosine curves, bars for x > 2, and the band between the curves."""
    plt.title('涂鸦2',fontsize=20)
    plt.xlabel('x轴',fontsize=18)
    plt.ylabel('y轴')
    plt.xticks([-np.pi,0,np.pi,np.pi*3],['-π','0','1π','3π'])
    plt.yticks([-1,-0.5,0,0.5,1])
    x=np.linspace(-np.pi*2,np.pi*3,90)
    y1=np.sin(x)
    y2=np.cos(x)
    # Compute the selection mask once and reuse it for both bar arguments.
    right=x>2
    plt.bar(x[right],y1[right],label='直方图')
    # Format string: a colour letter plus a marker/line style, in either order.
    # With a single sequence it is taken as y and x defaults to the index.
    plt.plot(x,y1,'bo')
    plt.plot(x,y2,'--.r')
    plt.fill_between(x,y1,y2,facecolor='gray',alpha=0.2)
    # plt.imshow needs an array with 2 or more dimensions, e.g. x.reshape(6,15)[:2,4:]
    plt.show()

#Egの大杂烩2()

***************分割线***************

class RandomWalk:
    """Two-dimensional random walk that starts at the origin."""

    def __init__(self,num=8000):
        # x and y hold the visited coordinates; num is the target number of points.
        self.x,self.y,self.num=[0],[0],num

    def steps(self):
        """Append random steps until the walk holds ``num`` points.

        Each step moves by a random sign times randint(81) in x and
        randint(49) in y. A step that moves nowhere is discarded.
        """
        while len(self.x)<self.num:
            xStep,yStep=choice([-1,1])*randint(81),choice([-1,1])*randint(49)
            if xStep==0 and yStep==0:
                # Standing still is not counted as a step.
                continue
            self.x.append(self.x[-1]+xStep)
            self.y.append(self.y[-1]+yStep)

def Egの随机漫步():
    """Plot a 10000-point random walk, coloured by step order, with start and end highlighted."""
    rw=RandomWalk(10000)
    rw.steps()
    # Colour each point by its position in the walk; the sequence length follows the points actually stored.
    plt.scatter(rw.x,rw.y,s=1,c=range(len(rw.x)),cmap=plt.cm.Greens)
    # Highlight the start (red) and the end (blue).
    plt.scatter(0,0,30,'red',edgecolors='none')
    plt.scatter(rw.x[-1],rw.y[-1],30,'blue')
    # Hide both axes; to hide only the y axis use plt.axes().get_yaxis().set_visible(False).
    plt.axis('off')
    plt.show()

#Egの随机漫步()

相关文章