已部署到shinyapps.io,详见SHMetro
0.配置环境和加载包
## Encoding: set R's default "encoding" option to UTF-8.
options(encoding = "UTF-8") ## needed for Chinese text
## Use getOption("encoding") to check whether the setting was changed.
#' Pick the locale name to pass to `Sys.setlocale()` for a given OS.
#'
#' @param os Operating-system name as reported by `Sys.info()[["sysname"]]`;
#'   only the value "Windows" is treated specially.
#' @param language Either "english" (default) or "chinese".
#' @return A length-one character vector holding the locale name.
#' @details Any other `language` raises an error instead of silently
#'   returning `NULL`, which previously surfaced later as an obscure
#'   failure inside `Sys.setlocale()`.
loc <- function(os, language = "english") {
  # Plain scalar test: ifelse() is meant for vectors and strips attributes.
  is_windows <- isTRUE(os == "Windows")
  switch(language,
    english = if (is_windows) "English_United States.1252" else "en_US.UTF-8",
    chinese = if (is_windows) "Chinese" else "zh_CN.utf-8",
    stop("Unsupported language: ", language, call. = FALSE)
  )
}
# Switch all locale categories to the Chinese locale name for this OS.
Sys.setlocale(
  "LC_ALL",
  locale = loc(Sys.info()[["sysname"]], language = "chinese")
)
# Set the working directory (the packages are loaded right after this).
# NOTE(review): machine-specific absolute path -- adjust before running
# on another computer.
setwd("/Users/jeevanyue/Rproject/map/SHMetro")
library(data.table)
library(bit64)
library(dplyr)
library(tidyr)
library(scales)
library(lubridate) #日期处理包
library(shiny)
library(leaflet)
library(lattice)
library(plotly)
library(chorddiag) #绘制chord
1. 地铁数据
# Station entry data and station exit data.
shmetro_in <- fread("data/shmetro_in.csv", encoding = "UTF-8")
shmetro_out <- fread("data/shmetro_out.csv", encoding = "UTF-8")
# Entry/exit data by line.
shmetro_line_in_out <- fread(
  "data/shmetro_line_in_out.csv",
  encoding = "UTF-8"
)
# Entry/exit association: widen the table so each `line_out` value becomes
# its own column filled from `count`, then turn every missing cell into 0.
in_out <- spread(shmetro_line_in_out, key = line_out, value = count)
in_out[is.na(in_out)] <- 0
# Station coordinates: keep the first five columns, ordered by line and
# then by line_id.
stations <- fread("data/stations.csv", encoding = "UTF-8")
stations <- stations %>%
  select(c(1:5)) %>%
  arrange(line, line_id)
stations_no <- nrow(stations)
# `lines`: for each row, every line value recorded for that row's station,
# joined with "/" in row order. Grouping once per station replaces the
# former `for (i in 1:stations_no)` loop, which rescanned the whole table
# for every row and misbehaved on an empty table (1:0 still iterates).
stations$lines <- ave(
  as.character(stations$line),
  stations$station,
  FUN = function(x) paste(x, collapse = "/")
)
1.1 地铁站经纬度
# Station coordinates: keep the first five columns, ordered by line and
# then by line_id.
# NOTE(review): this repeats the station-loading code that appears earlier
# in the file; the table is simply rebuilt here.
stations <- fread("data/stations.csv", encoding = "UTF-8")
stations <- stations %>%
  select(c(1:5)) %>%
  arrange(line, line_id)
stations_no <- nrow(stations)
# `lines`: for each row, every line value recorded for that row's station,
# joined with "/" in row order. Grouping once per station replaces the
# former `for (i in 1:stations_no)` loop, which rescanned the whole table
# for every row and misbehaved on an empty table (1:0 still iterates).
stations$lines <- ave(
  as.character(stations$line),
  stations$station,
  FUN = function(x) paste(x, collapse = "/")
)
invisible(gc())
1.2 交通卡交易数据
交通卡的交易信息有7个字段,分别是:卡号、交易日期、交易时间、站点名称、行业名称、交易金额、交易性质。
卡号:交通卡卡号
交易日期:日期格式yyyy-mm-dd
交易时间:时间格式hh:mm:ss
站点名称:内容包括线路和站名,如:"1号线莘庄"
行业名称:都是"地铁"
交易金额:0和大于0的值,0表示进站,大于0的值表示出站
交易性质:"优惠"和"非优惠"
# On macOS, read the raw transaction file like this (GB2312-encoded, no
# header row). system.time() reports how long the read took.
system.time(
  trade <- read.csv(
    "/Users/jeevanyue/Desktop/SPTCC-20150401.csv",
    # Spelled out: `F` is an ordinary variable that can be reassigned.
    header = FALSE,
    sep = ",",
    fileEncoding = "GB2312",
    # Keep text columns as character on R < 4.0 too, matching the
    # alternative readers kept below.
    stringsAsFactors = FALSE
  )
)
# On Windows, read it like this instead:
#system.time(trade <- fread("SPTCC-20150401/SPTCC-20150401.csv",integer64='character',stringsAsFactors=F))
#trade <- read.csv('data/SPTCC-20150401_Sample.txt',header=T,encoding='UTF-8',stringsAsFactors = F)
# Rename the seven columns.
names(trade) <- c(
  "card_id", "date", "time", "station", "vehicle", "money", "property"
)
# Keep only the metro rows, then drop the raw table and trigger a garbage
# collection.
trade_metro <- filter(trade, vehicle == "地铁")
rm(trade)
invisible(gc())
# "station" originally holds the line plus the station name; split it at
# "号线" into the two columns "line" and "station".
trade_metro <- separate(
  trade_metro,
  col = station,
  into = c("line", "station"),
  sep = "号线"
)
invisible(gc())
# Bucket the transaction time into five-minute slots: convert hh:mm:ss to
# seconds, divide by 300 and round up.
trade_metro <- mutate(
  trade_metro,
  M5 = ceiling(period_to_seconds(hms(time)) / 300)
)
invisible(gc())
# Drop the columns that are no longer needed.
trade_metro <- select(trade_metro, -vehicle, -property, -date)
invisible(gc())
#时间格式
#trade_metro$time <- strptime(paste("2015-04-01", trade_metro$time, sep=' '), "%Y-%m-%d %H:%M:%S", tz = "GMT")
1.3 处理异常值
# Normalise station names in the trade data that do not match the names
# used in `stations`.
# Assign into the column vector with an %in% mask: the previous
# `df[mask, ]$station <- value` form raises an error when no row matches
# (zero-row replacement) or when the mask contains NA.
trade_metro$station[trade_metro$station %in% "淞浜路"] <- "淞滨路"
trade_metro$station[trade_metro$station %in% "大木桥路 "] <- "大木桥路"
trade_metro$station[trade_metro$station %in% "上海大学站"] <- "上海大学"
1.4 进/出站数据
# Entry records: rows whose amount is 0, with the columns renamed to *_in.
trade_metro_in <- select(
  filter(trade_metro, money == 0),
  card_id,
  time_in = time,
  line_in = line,
  station_in = station,
  M5_in = M5
)
# Exit records: rows whose amount is above 0, with the columns renamed to
# *_out and the amount kept.
trade_metro_out <- select(
  filter(trade_metro, money > 0),
  card_id,
  time_out = time,
  line_out = line,
  station_out = station,
  money,
  M5_out = M5
)
1.5 虚拟换乘
上海火车站为虚拟换乘,删除半小时内3/4换1和1换3/4的数据
3/4换1的数据
# Line 3/4 -> line 1 transfers. Original author's note: the statistics
# showed that exits from lines 3/4 at Shanghai Railway Station are recorded
# under line 3.
trade_metro_out_34 <- filter(
  trade_metro_out,
  station_out == "上海火车站",
  line_out == 3 | line_out == 4
)
# Entries into line 1 at the same station.
trade_metro_in_1 <- filter(
  trade_metro_in,
  station_in == "上海火车站",
  line_in == 1
)
## merge出站和进站的