6 minute read

import pandas as pd

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
bts = pd.read_csv('data/bus_bts.csv')
jeju_life = pd.read_csv('data/jeju_financial_life_data.csv')
weather = pd.read_csv('data/weather.csv', encoding='cp949')
rain = pd.read_csv('data/rain.csv', encoding='utf-8')
train.head()
id date bus_route_id in_out station_code station_name latitude longitude 6~7_ride 7~8_ride ... 9~10_ride 10~11_ride 11~12_ride 6~7_takeoff 7~8_takeoff 8~9_takeoff 9~10_takeoff 10~11_takeoff 11~12_takeoff 18~20_ride
0 0 2019-09-01 4270000 시외 344 제주썬호텔 33.48990 126.49373 0.0 1.0 ... 5.0 2.0 6.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 1 2019-09-01 4270000 시외 357 한라병원 33.48944 126.48508 1.0 4.0 ... 2.0 5.0 6.0 0.0 0.0 0.0 0.0 0.0 0.0 5.0
2 2 2019-09-01 4270000 시외 432 정존마을 33.48181 126.47352 1.0 1.0 ... 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0
3 3 2019-09-01 4270000 시내 1579 제주국제공항(600번) 33.50577 126.49252 0.0 17.0 ... 26.0 14.0 16.0 0.0 0.0 0.0 0.0 0.0 0.0 53.0
4 4 2019-09-01 4270000 시내 1646 중문관광단지입구 33.25579 126.41260 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0

5 rows × 21 columns

train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415423 entries, 0 to 415422
Data columns (total 21 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id             415423 non-null  int64  
 1   date           415423 non-null  object 
 2   bus_route_id   415423 non-null  int64  
 3   in_out         415423 non-null  object 
 4   station_code   415423 non-null  int64  
 5   station_name   415423 non-null  object 
 6   latitude       415423 non-null  float64
 7   longitude      415423 non-null  float64
 8   6~7_ride       415423 non-null  float64
 9   7~8_ride       415423 non-null  float64
 10  8~9_ride       415423 non-null  float64
 11  9~10_ride      415423 non-null  float64
 12  10~11_ride     415423 non-null  float64
 13  11~12_ride     415423 non-null  float64
 14  6~7_takeoff    415423 non-null  float64
 15  7~8_takeoff    415423 non-null  float64
 16  8~9_takeoff    415423 non-null  float64
 17  9~10_takeoff   415423 non-null  float64
 18  10~11_takeoff  415423 non-null  float64
 19  11~12_takeoff  415423 non-null  float64
 20  18~20_ride     415423 non-null  float64
dtypes: float64(15), int64(3), object(3)
memory usage: 66.6+ MB
bts.head()
user_card_id bus_route_id vhc_id geton_date geton_time geton_station_code geton_station_name getoff_date getoff_time getoff_station_code getoff_station_name user_category user_count
0 1.010010e+15 23000000 149793674 2019-09-10 06:34:45 360 노형오거리 2019-09-10 07:10:31 592.0 화북초등학교 1 1
1 1.010010e+15 23000000 149793674 2019-09-10 06:34:58 360 노형오거리 2019-09-10 06:56:27 3273.0 고산동산(광양방면) 1 1
2 1.019160e+15 21420000 149793535 2019-09-10 07:19:07 2495 동광환승정류장4(제주방면) 2019-09-10 07:40:29 431.0 정존마을 4 1
3 1.019150e+15 21420000 149793512 2019-09-09 09:14:47 3282 대정환승정류장(대정읍사무소) 2019-09-09 10:02:46 431.0 정존마을 1 1
4 1.010010e+15 21420000 149793512 2019-09-09 09:28:53 2820 삼정지에듀 2019-09-09 10:21:37 2972.0 제주국제공항(종점) 4 1
bts.info(null_counts=True)
/Users/charles/Documents/GitHub/busRidersPredic/venv/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: null_counts is deprecated. Use show_counts instead
  """Entry point for launching an IPython kernel.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2409414 entries, 0 to 2409413
Data columns (total 13 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   user_card_id         2409414 non-null  float64
 1   bus_route_id         2409414 non-null  int64  
 2   vhc_id               2409414 non-null  int64  
 3   geton_date           2409414 non-null  object 
 4   geton_time           2409414 non-null  object 
 5   geton_station_code   2409414 non-null  int64  
 6   geton_station_name   2409365 non-null  object 
 7   getoff_date          1513678 non-null  object 
 8   getoff_time          1513678 non-null  object 
 9   getoff_station_code  1513678 non-null  float64
 10  getoff_station_name  1513639 non-null  object 
 11  user_category        2409414 non-null  int64  
 12  user_count           2409414 non-null  int64  
dtypes: float64(2), int64(5), object(6)
memory usage: 239.0+ MB
pd.options.display.max_info_columns
100
pd.options.display.max_info_rows
1690785
jeju_life.head()
zip_cd year_month x_axis y_axis sex age job_majorc job_smallc job_public job_profession ... avg_debt avg_debt_credit avg_debt_noneb avg_debt_mortgage avg_debt_deposit avg_debt_collateral avg_credit_rat medium_resid_rat large_resid_rat vehicle_own_rat
0 63299 201902 126.565889 33.513210 1 54 0.0 0.166667 0.000000 0.000000 ... 24285196 11419148 8863595 55550000 0 55550000 5 -999999.0 -999999.0 0.000000
1 63560 201902 126.497121 33.247225 1 59 0.0 0.125000 0.000000 0.000000 ... 108571077 25896955 13317443 0 200000000 74421721 4 -999999.0 -999999.0 0.000000
2 63560 201902 126.497121 33.247225 2 49 0.0 0.250000 0.000000 0.000000 ... 26424335 12015311 12015311 40833359 0 40833359 2 -999999.0 -999999.0 0.000000
3 63560 201902 126.497121 33.247225 1 54 0.0 0.181818 0.090909 0.090909 ... 647716901 39952556 21558153 36697792 0 706300496 4 -999999.0 -999999.0 0.166667
4 63560 201902 126.497121 33.247225 1 49 0.0 0.250000 0.000000 0.000000 ... 248232725 35712501 5568751 0 0 626810674 5 -999999.0 -999999.0 0.000000

5 rows × 27 columns

jeju_life.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10422 entries, 0 to 10421
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   zip_cd               10422 non-null  int64  
 1   year_month           10422 non-null  int64  
 2   x_axis               10422 non-null  float64
 3   y_axis               10422 non-null  float64
 4   sex                  10422 non-null  int64  
 5   age                  10422 non-null  int64  
 6   job_majorc           10422 non-null  float64
 7   job_smallc           10422 non-null  float64
 8   job_public           10422 non-null  float64
 9   job_profession       10422 non-null  float64
 10  job_self             10422 non-null  float64
 11  job_none             10422 non-null  float64
 12  job_other            10422 non-null  float64
 13  avg_income           10422 non-null  int64  
 14  med_income           10422 non-null  int64  
 15  avg_spend            10422 non-null  int64  
 16  avg_foreign_spend    10422 non-null  int64  
 17  avg_debt             10422 non-null  int64  
 18  avg_debt_credit      10422 non-null  int64  
 19  avg_debt_noneb       10422 non-null  int64  
 20  avg_debt_mortgage    10422 non-null  int64  
 21  avg_debt_deposit     10422 non-null  int64  
 22  avg_debt_collateral  10422 non-null  int64  
 23  avg_credit_rat       10422 non-null  int64  
 24  medium_resid_rat     10422 non-null  float64
 25  large_resid_rat      10422 non-null  float64
 26  vehicle_own_rat      10422 non-null  float64
dtypes: float64(12), int64(15)
memory usage: 2.1 MB
weather.head()
날짜 현재일기 현재기온 체감온도 일강수
0 2019-09-01 흐림 26.2 75.0 0
1 2019-09-02 비 끝남 22.5 72.0 25.7
2 2019-09-03 흐림 25.4 77.0 12.1
3 2019-09-04 약한비단속 24.2 75.0 34.3
4 2019-09-05 구름많음 27.6 79.0 0.1
weather.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   날짜      46 non-null     object 
 1   현재일기    46 non-null     object 
 2   현재기온    46 non-null     float64
 3   체감온도    46 non-null     float64
 4   일강수     46 non-null     object 
dtypes: float64(2), object(3)
memory usage: 1.9+ KB
rain.head()
지점 일시 기온(°C) 강수량(mm)
0 gosan 2019-09-01 23.916667 0.00
1 gosan 2019-09-02 23.700000 12.10
2 gosan 2019-09-03 23.550000 1.90
3 gosan 2019-09-04 23.533333 12.84
4 gosan 2019-09-05 26.033333 0.00
rain.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184 entries, 0 to 183
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   지점       184 non-null    object 
 1   일시       184 non-null    object 
 2   기온(°C)   184 non-null    float64
 3   강수량(mm)  66 non-null     float64
dtypes: float64(2), object(2)
memory usage: 5.9+ KB
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# 내부에 결과를 출력하도록 설정
%matplotlib inline

# 시각화 한글 폰트 설정
# window 사용자
# plt.rc('font', family='Malgun Gothic')
# macOS 사용자
plt.rc('font', family='AppleGothic')

plt.rc('axes', unicode_minus=False)

# 분석에 문제가 없는 경고 메시지는 숨긴다.
import warnings
warnings.filterwarnings('ignore')
# 학습 데이터 복사본 생성
traindata = train.copy()
# 타깃 변수(퇴근시간 승차인원) 분포 시각화
sns.distplot(traindata['18~20_ride'], kde=False, bins=50)
plt.axis([0, 50, 0, 450000])
plt.title("퇴근 시간 승차 인원 히스토그램")
plt.show()

png

traindata.groupby('18~20_ride').size().head()
18~20_ride
0.0    296528
1.0     44268
2.0     23752
3.0     13560
4.0      8630
dtype: int64
# 요일 변수 생성
# 날짜 형으로 변환
traindata['date'] = pd.to_datetime(traindata['date'])
# 요일을 문자형으로 추출해 변수 생성
traindata['weekday'] = traindata['date'].dt.strftime('%a')
traindata[['date', 'weekday']].head()
date weekday
0 2019-09-01 Sun
1 2019-09-01 Sun
2 2019-09-01 Sun
3 2019-09-01 Sun
4 2019-09-01 Sun
traindata[['date', 'weekday']].info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415423 entries, 0 to 415422
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   date     415423 non-null  datetime64[ns]
 1   weekday  415423 non-null  object        
dtypes: datetime64[ns](1), object(1)
memory usage: 6.3+ MB
# 요일별 퇴근시간 평균 탑승객 수 시각화
sns.barplot(x="weekday", y="18~20_ride", data=traindata)
plt.title("요일에 따른 퇴근 시간 평균 탑승객 수")     # number of riders at rushour by weekday
plt.show
<function matplotlib.pyplot.show(close=None, block=None)>

png

# 버스 종류별 퇴근시간 평균 탑승객 수 시각화
sns.barplot(x="in_out", y="18~20_ride", data=traindata)
plt.title("버스 종류에 따른 평균 탑승객 수")       # number of riders by busType
plt.show()

png

# 일별 출퇴근 시간의 총 승차 인원 데이터 생성
traindata['8~10_ride'] = traindata['8~9_ride'] + traindata['9~10_ride']
eda_data = traindata.groupby('date')[['18~20_ride', '8~10_ride']].agg('sum').reset_index()
eda_data.head()
date 18~20_ride 8~10_ride
0 2019-09-01 11537.0 11957.0
1 2019-09-02 20561.0 21844.0
2 2019-09-03 19743.0 25756.0
3 2019-09-04 20133.0 21472.0
4 2019-09-05 20392.0 26331.0
# 일별 출퇴근 시간 탑승객 수 시각화
plt.plot("date", "18~20_ride", "g-", label="퇴근 시간 탑승객 수", data=eda_data)
plt.plot("date", "8~10_ride", "b-", label="출근 시간 탑승객 수", data=eda_data)
plt.gcf().autofmt_xdate()         # x축의 라벨이 서로 겹치지 않도록 설정
plt.legend(loc=0)                 # 그래프 상에서 최적의 위치에 범례 표시
plt.title("일별 출퇴근 시간 탑승객 수")
plt.show()

png

Updated:

Leave a comment