01.pandas

时间:2023-03-08 17:41:14
01.pandas

01.Series

 # -*- coding: utf-8 -*-
 """
 Series 객체 특징
  - pandas 제공 1차원 자료구성
  - DataFrame 칼럼 구성요소
  - 수학/통계 관련 함수 제공
  - 범위 수정, 블럭 연산
  - indexing/slicing(list 동일)
  - 시계열 데이터 처리
 """

 import pandas as pd #pd.Series()
 from pandas import Series

 # 1. Creating a Series

 # 1) From a list.
 # The demo list is called `price_list`, not `list`: rebinding the builtin
 # `list` would break any later call such as list(df.columns) made in the
 # same interpreter session.
 price_list = [4000, 3000, 2000, 3500]
 # Multiplying a list repeats it:
 # [4000, 3000, 2000, 3500, 4000, 3000, 2000, 3500]
 print(price_list * 2)

 price = Series(price_list)
 # Multiplying a Series is element-wise arithmetic:
 # 0    8000
 # 1    6000
 # 2    4000
 # 3    7000
 # dtype: int64
 print(price * 2)

 print("index=", price.index)    # index= RangeIndex(start=0, stop=4, step=1)
 print("value=", price.values)   # value= [4000 3000 2000 3500]
 print(price_list[0], price[0])  # 4000 4000

 # 2) From a dict: the keys become the index, the values become the data.
 person = pd.Series({'name': '홍길동', 'age': 35, 'addr': '서울시'})
 print(person)
 # Output recorded by the original author (keys sorted alphabetically):
 # addr    서울시
 # age      35
 # name    홍길동
 # dtype: object
 # NOTE(review): recent pandas versions presumably keep the dict's insertion
 # order (name, age, addr) instead - confirm on the installed version.

 # 2. Indexing (same syntax as a list)
 # Mixed int/float input: the recorded outputs below show dtype float64.
 ser_data=pd.Series([4,4.5,6,8,10.5])

 print(ser_data[0]) #4.0
 # First three elements (recorded output follows).
 print(ser_data[:3])
 """
 0    4.0
 1    4.5
 2    6.0
 dtype: float64
 """
 # From the element at index 3 to the end.
 print(ser_data[3:])
 """
 3     8.0
 4    10.5
 dtype: float64
 """
 # Full slice: every element.
 print(ser_data[:])
 """
 0     4.0
 1     4.5
 2     6.0
 3     8.0
 4    10.5
 dtype: float64
 """
 #print(ser_data[-1]) # original note: a negative index cannot be used here
 # NOTE(review): presumably because an integer key is looked up as an index
 # label and no label -1 exists; ser_data.iloc[-1] should work - confirm.

 # Boolean condition: keep only the elements for which the mask is True
 print(ser_data[ser_data>=5])
 """
 2     6.0
 3     8.0
 4    10.5
 dtype: float64
 """

 # 3. Combining Series and handling missing values (NA)
 data1 = Series([4000, None, 3000, 2000],
                index=['a', 'm', 'o', 'k'])

 data2 = Series([4000, 3000, 3500, 2000],
                index=['a', 'o', 'k', 'm'])

 # Arithmetic between two Series aligns them on their index labels (like a
 # join on the index), not on position.
 result = data1 + data2
 print(result)
 # a    8000.0
 # k    5500.0
 # m       NaN   <- data1['m'] is missing, and number + NaN = NaN
 # o    6000.0
 # dtype: float64
 print(type(result))  # <class 'pandas.core.series.Series'>

 # NA handling: replace the missing value with 0 or with the mean.
 result2 = result.fillna(0)  # replace with 0
 print(result2)
 # a    8000.0
 # k    5500.0
 # m       0.0
 # o    6000.0
 # dtype: float64

 # mean() ignores the NaN entry: (8000 + 5500 + 6000) / 3 = 6500
 result3 = result.fillna(result.mean())  # replace with the mean
 print(result3)
 # a    8000.0
 # k    5500.0
 # m    6500.0
 # o    6000.0
 # dtype: float64

 # Boolean mask: True where a value is present.
 print(pd.notnull(result))
 # a     True
 # k     True
 # m    False
 # o     True
 # dtype: bool

 # Subset without the missing values.
 subset = result[pd.notnull(result)]
 print(subset)
 # a    8000.0
 # k    5500.0
 # o    6000.0
 # dtype: float64

 # 4. Series operations
 # ser_data is the Series built in the indexing section above.
 print(ser_data)
 """
 0     4.0
 1     4.5
 2     6.0
 3     8.0
 4    10.5
 dtype: float64
 """
 # 1) Block update: assign one value to a whole slice (elements 1..3)
 ser_data[1:4]=50
 print(ser_data)
 """
 0     4.0
 1    50.0
 2    50.0
 3    50.0
 4    10.5
 dtype: float64
 """

 # 2) Math / statistics functions (values recorded after the block update)
 print(ser_data.sum())#164.5
 print(ser_data.mean())#32.9
 print(ser_data.max())#50.0
 print(ser_data.min())#4.0

 # 3) Broadcast operation: the scalar is applied to every element
 print(ser_data * 0.5) # vector (1-D) * scalar (0-D)
 """
 0     2.00
 1    25.00
 2    25.00
 3    25.00
 4     5.25
 dtype: float64
 """

02.DataFrame

 # -*- coding: utf-8 -*-
 """
 Created on Sat Feb  9 12:34:12 2019

 @author: 502-03
 DataFrame 객체 특징
  - Pandas제공 2차원 행렬구조 (table 구조 동일)
  - 칼럼 단위 상이한 자료형 제공
  - DataFrame 구성요소
      -> Series : 1 차원 (vector)

 """

 import pandas as pd
 from pandas import DataFrame

 # 1. Building a DataFrame
 # One list per column; `columns=` fixes the column order explicitly.
 name = ['홍길동', '이순신', '강감찬', '유관순']
 age = [35, 45, 55, 25]
 pay = [350, 450, 550, 250]
 emp = pd.DataFrame(dict(name=name, age=age, pay=pay),
                    columns=['name', 'age', 'pay'])
 print(emp)
 #   name  age  pay
 # 0  홍길동   35  350
 # 1  이순신   45  450
 # 2  강감찬   55  550
 # 3  유관순   25  250

 # 1) Adding a column from a Series object
 gender = pd.Series(['M', 'M', 'M', 'F'])
 emp['gender'] = gender
 print(emp)
 #   name  age  pay gender
 # 0  홍길동   35  350      M
 # 1  이순신   45  450      M
 # 2  강감찬   55  550      M
 # 3  유관순   25  250      F

 # 2) Building a DataFrame from a NumPy array: 0..11 laid out as 3 rows x 4 columns
 import numpy as np
 frame = pd.DataFrame(data=np.arange(12).reshape(3, -1),
                      columns=['a', 'b', 'c', 'd'])
 print(frame)
 #    a  b   c   d
 # 0  0  1   2   3
 # 1  4  5   6   7
 # 2  8  9  10  11

 # Row / column statistics on `frame` (the 3x4 DataFrame built above)
 print(frame.mean()) # column-wise mean (no axis argument given)
 """
 a    4.0
 b    5.0
 c    6.0
 d    7.0
 dtype: float64
 """
 print(frame.mean(axis=0)) # column-wise mean; recorded output matches the call without axis
 """
 a    4.0
 b    5.0
 c    6.0
 d    7.0
 dtype: float64
 """
 print(frame.mean(axis=1)) # row-wise mean: one value per row
 """
 0    1.5
 1    5.5
 2    9.5
 dtype: float64
 """

 # 2. Working with the index
 # The three building blocks of a DataFrame: index, values, columns.
 print(frame.index)    # RangeIndex(start=0, stop=3, step=1)
 print(frame.values)
 # [[ 0  1  2  3]
 #  [ 4  5  6  7]
 #  [ 8  9 10 11]]
 print(frame.columns)  # Index(['a', 'b', 'c', 'd'], dtype='object')

 # 1) Promote column 'a' to be the index
 set_index = frame.set_index('a')
 print(set_index)
 #    b   c   d
 # a
 # 0  1   2   3
 # 4  5   6   7
 # 8  9  10  11

 # 2) Reset the index: 'a' becomes an ordinary column again
 reset_index = set_index.reset_index()
 print(reset_index)
 #    a  b   c   d
 # 0  0  1   2   3
 # 1  4  5   6   7
 # 2  8  9  10  11

 # 3. Referencing DataFrame columns

 # 1) Single column: attribute access and key access return the same Series
 a_col1 = frame.a      # DF.column
 a_col2 = frame['a']   # DF['column']

 print(a_col1)
 # 0    0
 # 1    4
 # 2    8
 # Name: a, dtype: int32   (dtype as recorded by the original author)
 print(a_col2)  # same output as above

 # Single cell: one .loc[row label, column label] look-up instead of the
 # chained frame['a'][2].
 a_col2 = frame.loc[2, 'a']
 print(a_col2)  # 8

 # 2) Several columns: pass a list of names; a slice such as ['a':'c']
 #    is not valid inside the list
 print(frame[['a', 'c']])
 #    a   c
 # 0  0   2
 # 1  4   6
 # 2  8  10

 # The list of names can also come from a variable. The result is printed
 # explicitly because a bare expression shows nothing when run as a script.
 cols = ['a', 'b']
 print(frame[cols])

 # 4. Making subsets

 # 1) Excluding a column: select every column except 'b'
 print('subset1')
 subset_df = frame[['a', 'c', 'd']]
 print(subset_df)
 #    a   c   d
 # 0  0   2   3
 # 1  4   6   7
 # 2  8  10  11

 # 2) Excluding rows: drop() takes index labels and returns a new
 #    DataFrame, so `frame` itself is left untouched
 print('drop')
 print(frame.drop(0))  # without the first row
 #    a  b   c   d
 # 1  4  5   6   7
 # 2  8  9  10  11
 print(frame.drop(1))  # without the second row
 #    a  b   c   d
 # 0  0  1   2   3
 # 2  8  9  10  11

 a_col = frame['a']  # DataFrame (2-D) -> Series (1-D)
 print(type(a_col))  # <class 'pandas.core.series.Series'>

 # Delete rows based on the values in column 'a'.
 # Plain rebinding is enough here (no copy is made or needed): every drop()
 # below returns a new object and rebinds the name.
 subset_df2 = frame
 print(subset_df2)
 #    a  b   c   d
 # 0  0  1   2   3
 # 1  4  5   6   7
 # 2  8  9  10  11

 # Iterate (index label, value) pairs so that drop() receives the real label;
 # a positional counter only matches the labels on a default 0..n-1 index.
 for label, value in a_col.items():
     print('i=', label, 'c=', value)
     if value < 5:
         subset_df2 = subset_df2.drop(label)
 # i= 0 c= 0
 # i= 1 c= 4
 # i= 2 c= 8

 print(subset_df2)
 #    a  b   c   d
 # 2  8  9  10  11

 # 3) When there are many columns
 # Load the iris data set from a path relative to the working directory.
 iris=pd.read_csv("../data/iris.csv")
 print(iris.info())
 '''
 <class 'pandas.core.frame.DataFrame'>
 RangeIndex: 150 entries, 0 to 149
 Data columns (total 5 columns):
 Sepal.Length    150 non-null float64
 Sepal.Width     150 non-null float64
 Petal.Length    150 non-null float64
 Petal.Width     150 non-null float64
 Species         150 non-null object
 dtypes: float64(4), object(1)
 memory usage: 5.9+ KB
 None
 '''
 print(type(iris)) # DataFrame
 print(iris.columns)
 """
 Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
        'Species'],
       dtype='object')
 """
 #cols = list(iris.columns) # extract the column names
 # Original note on the next line: "use this one on Python 3.6".
 # NOTE(review): list(...) presumably failed only because the name `list`
 # had been rebound earlier in the same session - confirm.
 cols=iris.columns.tolist()
 print(cols)
 '''
 ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']
 '''
 print(iris[cols[0]]) # first column
 """
 0      5.1
 1      4.9
 2      4.7
 3      4.6
 ...
 145    6.7
 146    6.3
 147    6.5
 148    6.2
 149    5.9
 """
 print(iris[cols[-1]])# last column
 """
 0         setosa
 1         setosa
 2         setosa
 3         setosa
 4         setosa
 ...
 146    virginica
 147    virginica
 148    virginica
 149    virginica
 """

 # Reference columns 1-3 by spelling out their names
 print(iris[['Sepal.Length', 'Sepal.Width', 'Petal.Length']])
 """
 0             5.1          3.5           1.4
 1             4.9          3.0           1.4
 2             4.7          3.2           1.3
 3             4.6          3.1           1.5
 4             5.0          3.6           1.4
 5             5.4          3.9           1.7
 ...
 146           6.3          2.5           5.0
 147           6.5          3.0           5.2
 148           6.2          3.4           5.4
 149           5.9          3.0           5.1
 """
 print(iris[cols[:3]]) # recommended: slice the list of column names instead
 """
 0             5.1          3.5           1.4
 1             4.9          3.0           1.4
 2             4.7          3.2           1.3
 3             4.6          3.1           1.5
 ...
 146           6.3          2.5           5.0
 147           6.5          3.0           5.2
 148           6.2          3.4           5.4
 149           5.9          3.0           5.1
 """

 # NOTE(review): the first line of the recorded output below
 # ("[150 rows x 3 columns]") appears to be the tail of the previous print,
 # not part of the head() output.
 print(iris.head())
 """
 [150 rows x 3 columns]
    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
 0           5.1          3.5           1.4          0.2  setosa
 1           4.9          3.0           1.4          0.2  setosa
 2           4.7          3.2           1.3          0.2  setosa
 3           4.6          3.1           1.5          0.2  setosa
 4           5.0          3.6           1.4          0.2  setosa
 """

 # Columns 1-4 -> x, column 5 -> y
 iris_x=iris[cols[:4]]
 iris_y=iris[cols[-1]]

 print(iris_x.shape)#(150, 4)  2-D
 print(iris_y.shape)#(150,)   1-D

 # 5. Row/column selection, similar to R's df[row, col1:col3]
 #
 # DataFrame.ix was deprecated in pandas 0.20 and removed in pandas 1.0, so
 # this section uses its two explicit replacements:
 #   DF.loc[row label, column label]         - label-based
 #   DF.iloc[row position, column position]  - integer-position-based
 # Both accept ':' for a contiguous range; a .loc label slice includes its
 # end label.
 print('frame')
 print(frame)
 #    a  b   c   d
 # 0  0  1   2   3
 # 1  4  5   6   7
 # 2  8  9  10  11

 print(frame.loc[1])  # a single key selects a row
 # a    4
 # b    5
 # c    6
 # d    7
 # Name: 1, dtype: int32   (dtype as recorded by the original author)

 print(frame.iloc[1, 2])  # 2nd row, 3rd column -> 6
 print(frame.loc[:, 'd'])  # every row of column d
 # 0     3
 # 1     7
 # 2    11
 # Name: d, dtype: int32

 print(frame.loc[:, 'b':'c'])  # every row of columns b through c
 #    b   c
 # 0  1   2
 # 1  5   6
 # 2  9  10

 print(len(iris))  # number of observations: 150

 # Random sampling without replacement.
 import numpy as np
 idx = np.random.choice(10, 5, replace=False)
 print(idx)  # e.g. [3 4 9 0 6]

 # Draw 70% of the row positions of iris as a training sample.
 idx = np.random.choice(len(iris), int(len(iris) * 0.7),
                        replace=False)
 print(idx, len(idx))  # 105 distinct positions in random order

 # idx holds positions in 0..len(iris)-1, so select positionally with .iloc.
 train_set = iris.iloc[idx, :]
 print(train_set.shape)  # (105, 5)

03.Descriptive

 # -*- coding: utf-8 -*-
 """
 1. DataFrame 요약통계량
 2. 변수 간의 상관성 분석
 """

 import pandas as pd

 # Load the survey-style product data set (columns a, b, c in the recorded
 # outputs below).
 product = pd.read_csv('../data/product.csv')
 print(product.info())

 # Summary statistics for every column
 summary = product.describe()
 print(summary)

 # Totals along an axis: 'index' (axis 0) gives one total per column,
 # 'columns' (axis 1) gives one total per row.
 print(product.sum(axis='index'))
 # a    773
 # b    827
 # c    817
 print(product.sum(axis='columns'))

 # Dispersion
 print(product.var())  # variance
 print(product.std())  # standard deviation

 # Frequency of each distinct value in column a
 a_cnt = product.a.value_counts()
 print(a_cnt)
 # 3    126
 # 4     64
 # 2     37
 # 1     30
 # 5      7

 # Distinct values of column b (duplicates removed)
 b_uni = product.b.unique()
 print(b_uni)  # [4 3 2 5 1]

 # Correlation between variables (-1 < r < 1)
 p_corr = product.corr()
 print(p_corr)
 #           a         b         c
 # a  1.000000  0.499209  0.467145
 # b  0.499209  1.000000  0.766853
 # c  0.467145  0.766853  1.000000

 # Correlation of one pair of columns
 ac_corr = product.a.corr(product.c)
 print(ac_corr)  # 0.4671449836008965

 # Exercise: correlation analysis (r) on iris columns 1-4
 # This script never defines `iris`, so load it here instead of relying on a
 # variable left over from another script or session.
 iris = pd.read_csv("../data/iris.csv")
 # .tolist() keeps working even if the builtin name `list` has been rebound.
 cols = iris.columns.tolist()
 print(cols)  # list of the 5 column names
 iris_sub = iris[cols[:4]]  # the first four columns

 print(iris_sub.corr())

04.merge

 # -*- coding: utf-8 -*-
 """
 DataFrame merge
 """

 import pandas as pd

 wdbc = pd.read_csv("../data/wdbc_data.csv")
 print(wdbc.info())
 # RangeIndex: 569 entries, 0 to 568
 # Data columns (total 32 columns):

 cols = wdbc.columns.tolist()
 print(cols)

 df1 = wdbc[cols[:16]]  # columns 1-16
 sid = wdbc['id']       # the id column
 # Take an explicit copy before adding a column: assigning into a selection
 # of wdbc triggers pandas' SettingWithCopyWarning.
 df2 = wdbc[cols[16:]].copy()  # columns 17-32
 df2['id'] = sid

 print(df1.shape)  # (569, 16)
 print(df2.shape)  # (569, 17)

 # 1. Merge the two frames on the id column.
 # 'id' is the only column the two frames share, so naming the key gives the
 # same result as the implicit merge while stating the intent.
 df_merge = pd.merge(df1, df2, on='id', how='inner')
 print(df_merge.info())
 # <class 'pandas.core.frame.DataFrame'>
 # Int64Index: 569 entries, 0 to 568
 # Data columns (total 32 columns):

 # 2. Put the frames side by side, column-wise
 df1 = wdbc[cols[:16]]  # columns 1-16
 df2 = wdbc[cols[16:]]  # columns 17-32

 df_merge2 = pd.concat([df1, df2], axis=1)  # column-wise concatenation
 print(df_merge2.info())
 # <class 'pandas.core.frame.DataFrame'>
 # RangeIndex: 569 entries, 0 to 568
 # Data columns (total 32 columns):

05.timeSeries

 # -*- coding: utf-8 -*-
 """
 시계열 데이터 시각화
  1. 날짜형식 수정(다국어 -> 한국어)
  2. 시계열 시각화
  3. 이동평균 기능
 """

 import pandas as pd
 from datetime import datetime # 날짜형식 수정 

 cospi = pd.read_csv("../data/cospi.csv")
 print(cospi.info())
 # RangeIndex: 247 entries, 0 to 246
 # Data columns (total 6 columns):
 # Date      247 non-null object
 # Open      247 non-null int64
 # High      247 non-null int64
 # Low       247 non-null int64
 # Close     247 non-null int64
 # Volume    247 non-null int64

 print(cospi.head())
 # 0  26-Feb-16  1180000  1187000  1172000  1172000  176906
 # Goal: 26-Feb-16 -> 2016-2-26

 # 1. Convert the date format
 # Each Date string is parsed with "%d-%b-%y" (day, abbreviated month name,
 # two-digit year) into a datetime object.
 Date = cospi['Date']  # same column as cospi.Date
 kDate = [datetime.strptime(raw, "%d-%b-%y") for raw in Date]

 print(kDate[:10])

 # Replace the text column with the parsed datetimes.
 cospi['Date'] = kDate
 print(cospi.head())

 # 2. Plotting the time series
 import matplotlib.pyplot as plt

 # Trend line of a single column
 cospi.High.plot(title="Trend line of High column")
 plt.show()

 # Trend lines of two columns on the same axes
 cospi.loc[:, ['High', 'Low']].plot(title="Trend line of High vs Low")
 plt.show()

 # 2. Changing the index
 print(cospi.index)  # RangeIndex(start=0, stop=247, step=1)

 # Use the Date column as the index; set_index returns a new DataFrame
 new_cospi = cospi.set_index('Date')
 print(new_cospi.head())

 # Date-based look-ups go through .loc: partial-string row selection with
 # plain brackets (df['2016-02']) was removed in pandas 2.0.

 # Search by year
 # NOTE(review): the two statements below were truncated in the source (only
 # "'])" survived); they are reconstructed as year look-ups - confirm the years.
 print(new_cospi.loc['2016'])
 print(new_cospi.loc['2015'])

 # Search by month
 print(new_cospi.loc['2016-02'])
 # Range search: the later month is given first, presumably because the
 # index runs newest-first (row 0 of the CSV is 26-Feb-16) - confirm.
 print(new_cospi.loc['2016-02':'2016-01'])

 new_cospi_HL = new_cospi[['High', 'Low']]
 # NOTE(review): the key of this look-up was lost in the source as well;
 # reconstructed as the year '2016' - confirm.
 new_cospi_HL.loc['2016'].plot(title="title")
 plt.show()

 new_cospi_HL.loc['2016-02'].plot(title="title")
 plt.show()

 # 3. Moving averages

 # Rolling means of the High column over 5-, 10- and 20-row windows
 roll_mean5 = new_cospi.High.rolling(window=5, center=False).mean()
 print(roll_mean5)

 roll_mean10 = new_cospi.High.rolling(window=10, center=False).mean()

 roll_mean20 = new_cospi.High.rolling(window=20, center=False).mean()

 # Plot the raw column together with the three rolling means
 new_cospi.High.plot(color='orange', label='High column')
 roll_mean5.plot(color='red', label='5day rolling mean')
 roll_mean10.plot(color='green', label='10day rolling mean')
 roll_mean20.plot(color='blue', label='20day rolling mean')
 plt.legend(loc='best')
 plt.show()