[RMSE] BusRidersPredic
import pandas as pd
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
bts = pd.read_csv('data/bus_bts.csv')
jeju_life = pd.read_csv('data/jeju_financial_life_data.csv')
weather = pd.read_csv('data/weather.csv', encoding='cp949')
rain = pd.read_csv('data/rain.csv', encoding='utf-8')
train.head()
id | date | bus_route_id | in_out | station_code | station_name | latitude | longitude | 6~7_ride | 7~8_ride | ... | 9~10_ride | 10~11_ride | 11~12_ride | 6~7_takeoff | 7~8_takeoff | 8~9_takeoff | 9~10_takeoff | 10~11_takeoff | 11~12_takeoff | 18~20_ride | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2019-09-01 | 4270000 | 시외 | 344 | 제주썬호텔 | 33.48990 | 126.49373 | 0.0 | 1.0 | ... | 5.0 | 2.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 1 | 2019-09-01 | 4270000 | 시외 | 357 | 한라병원 | 33.48944 | 126.48508 | 1.0 | 4.0 | ... | 2.0 | 5.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5.0 |
2 | 2 | 2019-09-01 | 4270000 | 시외 | 432 | 정존마을 | 33.48181 | 126.47352 | 1.0 | 1.0 | ... | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 |
3 | 3 | 2019-09-01 | 4270000 | 시내 | 1579 | 제주국제공항(600번) | 33.50577 | 126.49252 | 0.0 | 17.0 | ... | 26.0 | 14.0 | 16.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 53.0 |
4 | 4 | 2019-09-01 | 4270000 | 시내 | 1646 | 중문관광단지입구 | 33.25579 | 126.41260 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
5 rows × 21 columns
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415423 entries, 0 to 415422
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 415423 non-null int64
1 date 415423 non-null object
2 bus_route_id 415423 non-null int64
3 in_out 415423 non-null object
4 station_code 415423 non-null int64
5 station_name 415423 non-null object
6 latitude 415423 non-null float64
7 longitude 415423 non-null float64
8 6~7_ride 415423 non-null float64
9 7~8_ride 415423 non-null float64
10 8~9_ride 415423 non-null float64
11 9~10_ride 415423 non-null float64
12 10~11_ride 415423 non-null float64
13 11~12_ride 415423 non-null float64
14 6~7_takeoff 415423 non-null float64
15 7~8_takeoff 415423 non-null float64
16 8~9_takeoff 415423 non-null float64
17 9~10_takeoff 415423 non-null float64
18 10~11_takeoff 415423 non-null float64
19 11~12_takeoff 415423 non-null float64
20 18~20_ride 415423 non-null float64
dtypes: float64(15), int64(3), object(3)
memory usage: 66.6+ MB
bts.head()
user_card_id | bus_route_id | vhc_id | geton_date | geton_time | geton_station_code | geton_station_name | getoff_date | getoff_time | getoff_station_code | getoff_station_name | user_category | user_count | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.010010e+15 | 23000000 | 149793674 | 2019-09-10 | 06:34:45 | 360 | 노형오거리 | 2019-09-10 | 07:10:31 | 592.0 | 화북초등학교 | 1 | 1 |
1 | 1.010010e+15 | 23000000 | 149793674 | 2019-09-10 | 06:34:58 | 360 | 노형오거리 | 2019-09-10 | 06:56:27 | 3273.0 | 고산동산(광양방면) | 1 | 1 |
2 | 1.019160e+15 | 21420000 | 149793535 | 2019-09-10 | 07:19:07 | 2495 | 동광환승정류장4(제주방면) | 2019-09-10 | 07:40:29 | 431.0 | 정존마을 | 4 | 1 |
3 | 1.019150e+15 | 21420000 | 149793512 | 2019-09-09 | 09:14:47 | 3282 | 대정환승정류장(대정읍사무소) | 2019-09-09 | 10:02:46 | 431.0 | 정존마을 | 1 | 1 |
4 | 1.010010e+15 | 21420000 | 149793512 | 2019-09-09 | 09:28:53 | 2820 | 삼정지에듀 | 2019-09-09 | 10:21:37 | 2972.0 | 제주국제공항(종점) | 4 | 1 |
bts.info(null_counts=True)
/Users/charles/Documents/GitHub/busRidersPredic/venv/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: null_counts is deprecated. Use show_counts instead
"""Entry point for launching an IPython kernel.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2409414 entries, 0 to 2409413
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_card_id 2409414 non-null float64
1 bus_route_id 2409414 non-null int64
2 vhc_id 2409414 non-null int64
3 geton_date 2409414 non-null object
4 geton_time 2409414 non-null object
5 geton_station_code 2409414 non-null int64
6 geton_station_name 2409365 non-null object
7 getoff_date 1513678 non-null object
8 getoff_time 1513678 non-null object
9 getoff_station_code 1513678 non-null float64
10 getoff_station_name 1513639 non-null object
11 user_category 2409414 non-null int64
12 user_count 2409414 non-null int64
dtypes: float64(2), int64(5), object(6)
memory usage: 239.0+ MB
pd.options.display.max_info_columns
100
pd.options.display.max_info_rows
1690785
jeju_life.head()
zip_cd | year_month | x_axis | y_axis | sex | age | job_majorc | job_smallc | job_public | job_profession | ... | avg_debt | avg_debt_credit | avg_debt_noneb | avg_debt_mortgage | avg_debt_deposit | avg_debt_collateral | avg_credit_rat | medium_resid_rat | large_resid_rat | vehicle_own_rat | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 63299 | 201902 | 126.565889 | 33.513210 | 1 | 54 | 0.0 | 0.166667 | 0.000000 | 0.000000 | ... | 24285196 | 11419148 | 8863595 | 55550000 | 0 | 55550000 | 5 | -999999.0 | -999999.0 | 0.000000 |
1 | 63560 | 201902 | 126.497121 | 33.247225 | 1 | 59 | 0.0 | 0.125000 | 0.000000 | 0.000000 | ... | 108571077 | 25896955 | 13317443 | 0 | 200000000 | 74421721 | 4 | -999999.0 | -999999.0 | 0.000000 |
2 | 63560 | 201902 | 126.497121 | 33.247225 | 2 | 49 | 0.0 | 0.250000 | 0.000000 | 0.000000 | ... | 26424335 | 12015311 | 12015311 | 40833359 | 0 | 40833359 | 2 | -999999.0 | -999999.0 | 0.000000 |
3 | 63560 | 201902 | 126.497121 | 33.247225 | 1 | 54 | 0.0 | 0.181818 | 0.090909 | 0.090909 | ... | 647716901 | 39952556 | 21558153 | 36697792 | 0 | 706300496 | 4 | -999999.0 | -999999.0 | 0.166667 |
4 | 63560 | 201902 | 126.497121 | 33.247225 | 1 | 49 | 0.0 | 0.250000 | 0.000000 | 0.000000 | ... | 248232725 | 35712501 | 5568751 | 0 | 0 | 626810674 | 5 | -999999.0 | -999999.0 | 0.000000 |
5 rows × 27 columns
jeju_life.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10422 entries, 0 to 10421
Data columns (total 27 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 zip_cd 10422 non-null int64
1 year_month 10422 non-null int64
2 x_axis 10422 non-null float64
3 y_axis 10422 non-null float64
4 sex 10422 non-null int64
5 age 10422 non-null int64
6 job_majorc 10422 non-null float64
7 job_smallc 10422 non-null float64
8 job_public 10422 non-null float64
9 job_profession 10422 non-null float64
10 job_self 10422 non-null float64
11 job_none 10422 non-null float64
12 job_other 10422 non-null float64
13 avg_income 10422 non-null int64
14 med_income 10422 non-null int64
15 avg_spend 10422 non-null int64
16 avg_foreign_spend 10422 non-null int64
17 avg_debt 10422 non-null int64
18 avg_debt_credit 10422 non-null int64
19 avg_debt_noneb 10422 non-null int64
20 avg_debt_mortgage 10422 non-null int64
21 avg_debt_deposit 10422 non-null int64
22 avg_debt_collateral 10422 non-null int64
23 avg_credit_rat 10422 non-null int64
24 medium_resid_rat 10422 non-null float64
25 large_resid_rat 10422 non-null float64
26 vehicle_own_rat 10422 non-null float64
dtypes: float64(12), int64(15)
memory usage: 2.1 MB
weather.head()
날짜 | 현재일기 | 현재기온 | 체감온도 | 일강수 | |
---|---|---|---|---|---|
0 | 2019-09-01 | 흐림 | 26.2 | 75.0 | 0 |
1 | 2019-09-02 | 비 끝남 | 22.5 | 72.0 | 25.7 |
2 | 2019-09-03 | 흐림 | 25.4 | 77.0 | 12.1 |
3 | 2019-09-04 | 약한비단속 | 24.2 | 75.0 | 34.3 |
4 | 2019-09-05 | 구름많음 | 27.6 | 79.0 | 0.1 |
weather.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 날짜 46 non-null object
1 현재일기 46 non-null object
2 현재기온 46 non-null float64
3 체감온도 46 non-null float64
4 일강수 46 non-null object
dtypes: float64(2), object(3)
memory usage: 1.9+ KB
rain.head()
지점 | 일시 | 기온(°C) | 강수량(mm) | |
---|---|---|---|---|
0 | gosan | 2019-09-01 | 23.916667 | 0.00 |
1 | gosan | 2019-09-02 | 23.700000 | 12.10 |
2 | gosan | 2019-09-03 | 23.550000 | 1.90 |
3 | gosan | 2019-09-04 | 23.533333 | 12.84 |
4 | gosan | 2019-09-05 | 26.033333 | 0.00 |
rain.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184 entries, 0 to 183
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 지점 184 non-null object
1 일시 184 non-null object
2 기온(°C) 184 non-null float64
3 강수량(mm) 66 non-null float64
dtypes: float64(2), object(2)
memory usage: 5.9+ KB
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# 내부에 결과를 출력하도록 설정
%matplotlib inline
# 시각화 한글 폰트 설정
# window 사용자
# plt.rc('font', family='Malgun Gothic')
# macOS 사용자
plt.rc('font', family='AppleGothic')
plt.rc('axes', unicode_minus=False)
# 분석에 문제가 없는 경고 메시지는 숨긴다.
import warnings
warnings.filterwarnings('ignore')
# 학습 데이터 복사본 생성
traindata = train.copy()
# 타깃 변수(퇴근시간 승차인원) 분포 시각화
sns.distplot(traindata['18~20_ride'], kde=False, bins=50)
plt.axis([0, 50, 0, 450000])
plt.title("퇴근 시간 승차 인원 히스토그램")
plt.show()
traindata.groupby('18~20_ride').size().head()
18~20_ride
0.0 296528
1.0 44268
2.0 23752
3.0 13560
4.0 8630
dtype: int64
# 요일 변수 생성
# 날짜 형으로 변환
traindata['date'] = pd.to_datetime(traindata['date'])
# 요일을 문자형으로 추출해 변수 생성
traindata['weekday'] = traindata['date'].dt.strftime('%a')
traindata[['date', 'weekday']].head()
date | weekday | |
---|---|---|
0 | 2019-09-01 | Sun |
1 | 2019-09-01 | Sun |
2 | 2019-09-01 | Sun |
3 | 2019-09-01 | Sun |
4 | 2019-09-01 | Sun |
traindata[['date', 'weekday']].info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415423 entries, 0 to 415422
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 date 415423 non-null datetime64[ns]
1 weekday 415423 non-null object
dtypes: datetime64[ns](1), object(1)
memory usage: 6.3+ MB
# 요일별 퇴근시간 평균 탑승객 수 시각화
sns.barplot(x="weekday", y="18~20_ride", data=traindata)
plt.title("요일에 따른 퇴근 시간 평균 탑승객 수") # number of riders at rushour by weekday
plt.show
<function matplotlib.pyplot.show(close=None, block=None)>
# 버스 종류별 퇴근시간 평균 탑승객 수 시각화
sns.barplot(x="in_out", y="18~20_ride", data=traindata)
plt.title("버스 종류에 따른 평균 탑승객 수") # number of riders by busType
plt.show()
# 일별 출퇴근 시간의 총 승차 인원 데이터 생성
traindata['8~10_ride'] = traindata['8~9_ride'] + traindata['9~10_ride']
eda_data = traindata.groupby('date')[['18~20_ride', '8~10_ride']].agg('sum').reset_index()
eda_data.head()
date | 18~20_ride | 8~10_ride | |
---|---|---|---|
0 | 2019-09-01 | 11537.0 | 11957.0 |
1 | 2019-09-02 | 20561.0 | 21844.0 |
2 | 2019-09-03 | 19743.0 | 25756.0 |
3 | 2019-09-04 | 20133.0 | 21472.0 |
4 | 2019-09-05 | 20392.0 | 26331.0 |
# 일별 출퇴근 시간 탑승객 수 시각화
plt.plot("date", "18~20_ride", "g-", label="퇴근 시간 탑승객 수", data=eda_data)
plt.plot("date", "8~10_ride", "b-", label="출근 시간 탑승객 수", data=eda_data)
plt.gcf().autofmt_xdate() # x축의 라벨이 서로 겹치지 않도록 설정
plt.legend(loc=0) # 그래프 상에서 최적의 위치에 범례 표시
plt.title("일별 출퇴근 시간 탑승객 수")
plt.show()
Leave a comment