[RMSE] BusRidersPredic

6 minute read

import pandas as pd

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
bts = pd.read_csv('data/bus_bts.csv')
jeju_life = pd.read_csv('data/jeju_financial_life_data.csv')
weather = pd.read_csv('data/weather.csv', encoding='cp949')
rain = pd.read_csv('data/rain.csv', encoding='utf-8')

train.head()

	id	date	bus_route_id	in_out	station_code	station_name	latitude	longitude	6~7_ride	7~8_ride	...	9~10_ride	10~11_ride	11~12_ride	9~10_takeoff	18~20_ride
0	0	2019-09-01	4270000	시외	344	제주썬호텔	33.48990	126.49373	0.0	1.0	...	5.0	2.0	6.0	0.0	0.0
1	1	2019-09-01	4270000	시외	357	한라병원	33.48944	126.48508	1.0	4.0	...	2.0	5.0	6.0	0.0	5.0
2	2	2019-09-01	4270000	시외	432	정존마을	33.48181	126.47352	1.0	1.0	...	2.0	0.0	0.0	0.0	2.0
3	3	2019-09-01	4270000	시내	1579	제주국제공항(600번)	33.50577	126.49252	0.0	17.0	...	26.0	14.0	16.0	0.0	53.0
4	4	2019-09-01	4270000	시내	1646	중문관광단지입구	33.25579	126.41260	0.0	0.0	...	0.0	0.0	0.0	1.0	0.0

5 rows × 21 columns

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415423 entries, 0 to 415422
Data columns (total 21 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 id             415423 non-null  int64  
 date           415423 non-null  object 
 bus_route_id   415423 non-null  int64  
 in_out         415423 non-null  object 
 station_code   415423 non-null  int64  
 station_name   415423 non-null  object 
 latitude       415423 non-null  float64
 longitude      415423 non-null  float64
 6~7_ride       415423 non-null  float64
 7~8_ride       415423 non-null  float64
8~9_ride       415423 non-null  float64
9~10_ride      415423 non-null  float64
10~11_ride     415423 non-null  float64
11~12_ride     415423 non-null  float64
6~7_takeoff    415423 non-null  float64
7~8_takeoff    415423 non-null  float64
8~9_takeoff    415423 non-null  float64
9~10_takeoff   415423 non-null  float64
10~11_takeoff  415423 non-null  float64
11~12_takeoff  415423 non-null  float64
18~20_ride     415423 non-null  float64
dtypes: float64(15), int64(3), object(3)
memory usage: 66.6+ MB

bts.head()

	user_card_id	bus_route_id	vhc_id	geton_date	geton_time	geton_station_code	geton_station_name	getoff_date	getoff_time	getoff_station_code	getoff_station_name	user_category	user_count
0	1.010010e+15	23000000	149793674	2019-09-10	06:34:45	360	노형오거리	2019-09-10	07:10:31	592.0	화북초등학교	1	1
1	1.010010e+15	23000000	149793674	2019-09-10	06:34:58	360	노형오거리	2019-09-10	06:56:27	3273.0	고산동산(광양방면)	1	1
2	1.019160e+15	21420000	149793535	2019-09-10	07:19:07	2495	동광환승정류장4(제주방면)	2019-09-10	07:40:29	431.0	정존마을	4	1
3	1.019150e+15	21420000	149793512	2019-09-09	09:14:47	3282	대정환승정류장(대정읍사무소)	2019-09-09	10:02:46	431.0	정존마을	1	1
4	1.010010e+15	21420000	149793512	2019-09-09	09:28:53	2820	삼정지에듀	2019-09-09	10:21:37	2972.0	제주국제공항(종점)	4	1

bts.info(null_counts=True)

/Users/charles/Documents/GitHub/busRidersPredic/venv/lib/python3.7/site-packages/ipykernel_launcher.py:1: FutureWarning: null_counts is deprecated. Use show_counts instead
  """Entry point for launching an IPython kernel.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2409414 entries, 0 to 2409413
Data columns (total 13 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   user_card_id         2409414 non-null  float64
 1   bus_route_id         2409414 non-null  int64  
 2   vhc_id               2409414 non-null  int64  
 3   geton_date           2409414 non-null  object 
 4   geton_time           2409414 non-null  object 
 5   geton_station_code   2409414 non-null  int64  
 6   geton_station_name   2409365 non-null  object 
 7   getoff_date          1513678 non-null  object 
 8   getoff_time          1513678 non-null  object 
 9   getoff_station_code  1513678 non-null  float64
 10  getoff_station_name  1513639 non-null  object 
 11  user_category        2409414 non-null  int64  
 12  user_count           2409414 non-null  int64  
dtypes: float64(2), int64(5), object(6)
memory usage: 239.0+ MB

pd.options.display.max_info_columns

pd.options.display.max_info_rows

jeju_life.head()

	zip_cd	year_month	x_axis	y_axis	sex	age	job_smallc	job_public	job_profession	...	avg_debt	avg_debt_credit	avg_debt_noneb	avg_debt_mortgage	avg_debt_deposit	avg_debt_collateral	avg_credit_rat	medium_resid_rat	large_resid_rat	vehicle_own_rat
0	63299	201902	126.565889	33.513210	1	54	0.166667	0.000000	0.000000	...	24285196	11419148	8863595	55550000	0	55550000	5	-999999.0	-999999.0	0.000000
1	63560	201902	126.497121	33.247225	1	59	0.125000	0.000000	0.000000	...	108571077	25896955	13317443	0	200000000	74421721	4	-999999.0	-999999.0	0.000000
2	63560	201902	126.497121	33.247225	2	49	0.250000	0.000000	0.000000	...	26424335	12015311	12015311	40833359	0	40833359	2	-999999.0	-999999.0	0.000000
3	63560	201902	126.497121	33.247225	1	54	0.181818	0.090909	0.090909	...	647716901	39952556	21558153	36697792	0	706300496	4	-999999.0	-999999.0	0.166667
4	63560	201902	126.497121	33.247225	1	49	0.250000	0.000000	0.000000	...	248232725	35712501	5568751	0	0	626810674	5	-999999.0	-999999.0	0.000000

5 rows × 27 columns

jeju_life.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10422 entries, 0 to 10421
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 zip_cd               10422 non-null  int64  
 year_month           10422 non-null  int64  
 x_axis               10422 non-null  float64
 y_axis               10422 non-null  float64
 sex                  10422 non-null  int64  
 age                  10422 non-null  int64  
 job_majorc           10422 non-null  float64
 job_smallc           10422 non-null  float64
 job_public           10422 non-null  float64
 job_profession       10422 non-null  float64
job_self             10422 non-null  float64
job_none             10422 non-null  float64
job_other            10422 non-null  float64
avg_income           10422 non-null  int64  
med_income           10422 non-null  int64  
avg_spend            10422 non-null  int64  
avg_foreign_spend    10422 non-null  int64  
avg_debt             10422 non-null  int64  
avg_debt_credit      10422 non-null  int64  
avg_debt_noneb       10422 non-null  int64  
avg_debt_mortgage    10422 non-null  int64  
avg_debt_deposit     10422 non-null  int64  
avg_debt_collateral  10422 non-null  int64  
avg_credit_rat       10422 non-null  int64  
medium_resid_rat     10422 non-null  float64
large_resid_rat      10422 non-null  float64
vehicle_own_rat      10422 non-null  float64
dtypes: float64(12), int64(15)
memory usage: 2.1 MB

weather.head()

	날짜	현재일기	현재기온	체감온도	일강수
0	2019-09-01	흐림	26.2	75.0	0
1	2019-09-02	비 끝남	22.5	72.0	25.7
2	2019-09-03	흐림	25.4	77.0	12.1
3	2019-09-04	약한비단속	24.2	75.0	34.3
4	2019-09-05	구름많음	27.6	79.0	0.1

weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46 entries, 0 to 45
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   날짜      46 non-null     object 
 1   현재일기    46 non-null     object 
 2   현재기온    46 non-null     float64
 3   체감온도    46 non-null     float64
 4   일강수     46 non-null     object 
dtypes: float64(2), object(3)
memory usage: 1.9+ KB

rain.head()

	지점	일시	기온(°C)	강수량(mm)
0	gosan	2019-09-01	23.916667	0.00
1	gosan	2019-09-02	23.700000	12.10
2	gosan	2019-09-03	23.550000	1.90
3	gosan	2019-09-04	23.533333	12.84
4	gosan	2019-09-05	26.033333	0.00

rain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184 entries, 0 to 183
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   지점       184 non-null    object 
 1   일시       184 non-null    object 
 2   기온(°C)   184 non-null    float64
 3   강수량(mm)  66 non-null     float64
dtypes: float64(2), object(2)
memory usage: 5.9+ KB

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# 내부에 결과를 출력하도록 설정
%matplotlib inline

# 시각화 한글 폰트 설정
# window 사용자
# plt.rc('font', family='Malgun Gothic')
# macOS 사용자
plt.rc('font', family='AppleGothic')

plt.rc('axes', unicode_minus=False)

# 분석에 문제가 없는 경고 메시지는 숨긴다.
import warnings
warnings.filterwarnings('ignore')

# 학습 데이터 복사본 생성
traindata = train.copy()

# 타깃 변수(퇴근시간 승차인원) 분포 시각화
sns.distplot(traindata['18~20_ride'], kde=False, bins=50)
plt.axis([0, 50, 0, 450000])
plt.title("퇴근 시간 승차 인원 히스토그램")
plt.show()

png

traindata.groupby('18~20_ride').size().head()

18~20_ride
0    296528
0     44268
0     23752
0     13560
0      8630
dtype: int64

# 요일 변수 생성
# 날짜 형으로 변환
traindata['date'] = pd.to_datetime(traindata['date'])
# 요일을 문자형으로 추출해 변수 생성
traindata['weekday'] = traindata['date'].dt.strftime('%a')

traindata[['date', 'weekday']].head()

	date	weekday
0	2019-09-01	Sun
1	2019-09-01	Sun
2	2019-09-01	Sun
3	2019-09-01	Sun
4	2019-09-01	Sun

traindata[['date', 'weekday']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415423 entries, 0 to 415422
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   date     415423 non-null  datetime64[ns]
 1   weekday  415423 non-null  object        
dtypes: datetime64[ns](1), object(1)
memory usage: 6.3+ MB

# 요일별 퇴근시간 평균 탑승객 수 시각화
sns.barplot(x="weekday", y="18~20_ride", data=traindata)
plt.title("요일에 따른 퇴근 시간 평균 탑승객 수")     # number of riders at rushour by weekday
plt.show

<function matplotlib.pyplot.show(close=None, block=None)>

png

# 버스 종류별 퇴근시간 평균 탑승객 수 시각화
sns.barplot(x="in_out", y="18~20_ride", data=traindata)
plt.title("버스 종류에 따른 평균 탑승객 수")       # number of riders by busType
plt.show()

png

# 일별 출퇴근 시간의 총 승차 인원 데이터 생성
traindata['8~10_ride'] = traindata['8~9_ride'] + traindata['9~10_ride']
eda_data = traindata.groupby('date')[['18~20_ride', '8~10_ride']].agg('sum').reset_index()

eda_data.head()

	date	18~20_ride	8~10_ride
0	2019-09-01	11537.0	11957.0
1	2019-09-02	20561.0	21844.0
2	2019-09-03	19743.0	25756.0
3	2019-09-04	20133.0	21472.0
4	2019-09-05	20392.0	26331.0

# 일별 출퇴근 시간 탑승객 수 시각화
plt.plot("date", "18~20_ride", "g-", label="퇴근 시간 탑승객 수", data=eda_data)
plt.plot("date", "8~10_ride", "b-", label="출근 시간 탑승객 수", data=eda_data)
plt.gcf().autofmt_xdate()         # x축의 라벨이 서로 겹치지 않도록 설정
plt.legend(loc=0)                 # 그래프 상에서 최적의 위치에 범례 표시
plt.title("일별 출퇴근 시간 탑승객 수")
plt.show()

png

Share on

Twitter Facebook LinkedIn

안철현(Charles An)

[RMSE] BusRidersPredic

Share on

Leave a comment

You may also enjoy

파사트 헤드라이트 램프(HID) 교체

Clustering with K-Means

Creating Featues

Tips for Creating Features