一切先從讀檔開始
ubike <- fread("data/ubikebyhourutf8/ubike-hour-201502-utf8.csv",
data.table = FALSE,
colClasses=c("factor","integer","integer","factor","factor","numeric",
"numeric","integer","numeric","integer","integer","numeric",
"numeric","integer","integer","numeric","numeric","numeric",
"numeric","numeric","numeric"))
整理一下資料
ubike1 <- filter(ubike, sno==1) %>%
mutate(sbi.range=max.sbi-min.sbi) %>%
mutate(is.rushhours=cut(hour, breaks=c(0, 8, 10, 17, 20, 24),
labels = c(0,1,0,1,0), right=FALSE)) %>%
mutate(is.weekday=ifelse(strftime(date, "%u") < 6, 1, 0))
tab1 <- filter(ubike1, is.rushhours==1, is.weekday==1) %>%
group_by(tot) %>%
summarise(min(sbi.range), mean(sbi.range), max(sbi.range))
df2 <- filter(ubike, sno==1) %>%
mutate(is.rain=rainfall>1) %>%
mutate(is.rain=factor(is.rain, levels=c(FALSE, TRUE),
labels = c("晴天","雨天"))) %>%
select(date, hour, tot, avg.sbi, avg.bemp, temp, is.rain) %>%
group_by(date, hour, is.rain) %>%
summarise(rate.sbi=mean(avg.sbi)/tot) %>%
group_by(hour, is.rain) %>%
summarise(rate.sbi=mean(rate.sbi))
Dodge Plot
Hint: geom_bar(stat="identity", position="dodge")
ggplot(df2, aes(x=hour, y=rate.sbi, fill=is.rain)) +
geom_bar(stat="identity", position="dodge") +
labs(x="時間", y="有車率") +
thm() +
theme(legend.title=element_blank())
Facet panels
Hint: facet_grid(y~.)
or facet_grid(.~x)
ggplot(df2, aes(x=hour, y=rate.sbi, fill=is.rain)) +
geom_bar(stat="identity", position="dodge") +
labs(x="時間", y="有車率") +
thm() +
theme(legend.title=element_blank()) +
facet_grid(is.rain~.)
Facet panels
Hint: facet_grid(y~.)
or facet_grid(.~x)
ggplot(df2, aes(x=hour, y=rate.sbi, fill=is.rain)) +
geom_bar(stat="identity", position="dodge") +
labs(x="時間", y="有車率") +
thm() +
theme(legend.title=element_blank()) +
facet_grid(.~is.rain)
Pyramid
Hint: filter(df2, is.rain=="晴天")
, and coord_flip()
ggplot(df2, aes(x=hour,y=rate.sbi, fill=is.rain)) +
geom_bar(data=filter(df2, is.rain=="晴天"), stat="identity") +
geom_bar(aes(y=rate.sbi*(-1)), data=filter(df2, is.rain=="雨天"),
stat="identity") +
scale_y_continuous(breaks=seq(from=-1, to=1, by=0.1),
labels=abs(seq(-1, 1, 0.1))) +
labs(x="時間", y="有車率") +
theme(legend.title=element_blank()) +
coord_flip() + thm()
熱點圖
熱點圖 (heatmap) 是用顏色深淺呈現數值大小的視覺化。
Hint: geom_tile()
ggplot(df2, aes(x=hour, y=is.rain, fill=rate.sbi)) +
geom_tile() +
scale_fill_gradient(name="有車率", low="white", high="midnightblue") +
labs(x="時間", y="天氣") +
thm()
平行座標圖
平行座標圖 (Parallel coordinate plot) 多用於呈現多欄位的資料視覺化,強調欄位的順序性,特別適合用在因果關係的陳述。譬如:行業別 -> 是否上DSP課程 -> 職場表現。
Hint: library(GGally)
and ggparcoord()
library(GGally)
df2 <- mutate(df2, rain=as.numeric(is.rain)-1)
ggparcoord(data = iris, columns = 1:4, groupColumn = 5,
title = "Parallel Coordinate Plot for the Iris Data") + thm()
與鄰近場站的關係
tmp <- group_by(ubike, sno, sna, sarea, lat, lng) %>% distinct
dist <- round(distm(x=tmp[, c("lng","lat")])[,1])
df5 <- tmp %>% select(sno, sna, sarea, lat, lng) %>%
cbind(dist) %>% arrange(dist) %>% top_n(10, wt = -dist)
地圖應用範例
利用ggmap
套件導入google map作為底圖將場站位置標示出來。
Hint: library(ggmap)
, map <- get_map("Taipei"); ggmap(map)
, geom_point
library(ggmap)
df5$is.cityhall <- factor(c(1, rep(0, 9)), levels=1:0)
map <- get_map(location=c(lon=df5$lng[1], lat=df5$lat[1]) , zoom = 15)
ggmap(map) + thm() +
geom_point(data=df5, aes(x=lng, y=lat, colour=is.cityhall), size=5) +
geom_text(data=df5, aes(x=lng, y=lat, label=sna, colour=is.cityhall),
position="jitter", vjust=-1, hjust=0.5, size=4, family="STHeiti") +
theme(legend.position="none") + scale_color_brewer(palette="Set1")
熱點圖進階應用
tmp1 <- filter(ubike, sno%in%df5$sno) %>%
mutate(is.rain=rainfall>1) %>%
mutate(is.rain=factor(is.rain, levels=c(FALSE, TRUE),
labels = c("晴天","雨天"))) %>%
mutate(is.weekday=strftime(date, "%u")<6) %>%
mutate(is.weekday=factor(is.weekday, levels=c(FALSE, TRUE),
labels=c("平日","假日"))) %>%
mutate(is.rushhours=cut(hour, breaks=c(0, 4, 7, 24), right=FALSE)) %>%
group_by(date, sno, sna, is.weekday, is.rushhours, is.rain, hour, tot) %>%
summarise(rate.sbi=mean(avg.sbi)/tot, rate.used=mean(max.sbi-min.sbi)/tot)
df6 <- tmp1 %>%
filter(is.weekday=="平日", is.rain=="晴天") %>%
group_by(sno, sna, sna, hour) %>%
summarise(rate.sbi=mean(rate.sbi), rate.used=mean(rate.used))
heatmap 排序
df7 <- df6
df7$sna <- factor(df7$sna, levels=(sna.order[,2]))
ggplot(df7, aes(x=hour, y=sna, fill=rate.sbi)) + geom_tile() + thm() +
theme(legend.position="bottom") +
scale_fill_gradient(name="有車率", low="white", high="lawngreen") +
labs(x="時間", y="") +
theme(axis.text = element_text(size = 13, color="darkgreen"))
對時間做排序
hc.hour <- hclust(dist(t(dat)))
ggdendrogram(hc.hour) + thm() + labs(x="", y="")
對時間做排序
hour.order <- data.frame(order=1:24, sna=hc.hour$labels[hc.hour$order])
df7$hour <- factor(df7$hour, levels=(hour.order[,2]))
ggplot(df7, aes(x=hour, y=sna, fill=rate.sbi)) + geom_tile() + thm()+
theme(legend.position="bottom") +
scale_fill_gradient(name="有車率", low="white", high="lawngreen") +
labs(x="時間", y="") +
theme(axis.text = element_text(size = 13, color="darkgreen"))
試著對 使用率 進行排序
dat <- dcast(df6, sna~hour, value.var="rate.used")
rownames(dat) <- dat[,1]
dat <- dat[,-1]
hc.sna <- hclust(dist(dat))
hc.hour <- hclust(dist(t(dat)))
df8 <- df6
df8$sna <- factor(df8$sna, levels = hc.sna$labels[hc.sna$order])
df8$hour <- factor(df8$hour, levels = hc.hour$labels[hc.hour$order])
ggplot(df8, aes(x=hour, y=sna, fill=rate.used)) + geom_tile() + thm()+
theme(legend.position="bottom") +
scale_fill_gradient(name="使用率", low="white", high="Navy") +
labs(x="時間", y="") +
theme(axis.text = element_text(size = 13, color="darkblue"))
平行座標圖進階應用
tmp2 <- filter(tmp1, is.weekday=="平日", is.rain=="晴天", hour>6 & hour<22) %>%
group_by(sno, sna, tot) %>%
summarise(rate.sbi=mean(rate.sbi), rate.used=mean(rate.used))
km <- kmeans(tmp2[,3:5], 3)
km
K-means clustering with 3 clusters of sizes 4, 1, 5
Cluster means:
tot rate.sbi rate.used
1 65.0 0.3761128 0.1867222
2 180.0 0.2840504 0.1123704
3 35.2 0.2137039 0.2520515
Clustering vector:
[1] 2 3 1 1 1 1 3 3 3 3
Within cluster sum of squares by cluster:
[1] 300.00679 0.00000 92.84539
(between_SS / total_SS = 97.8 %)
Available components:
[1] "cluster" "centers" "totss" "withinss"
[5] "tot.withinss" "betweenss" "size" "iter"
[9] "ifault"
平行座標圖進階應用
df9 <- group_by(tmp2) %>%
transmute(sna, tot, rate.sbi, rate.used,
group=factor(km$cluster)) %>%
arrange(group)
ggparcoord(as.data.frame(df9), columns = c(1,2,3,4), groupColumn = 5,
scale="uniminmax") +
geom_line(size=1) + thm() + theme(legend.title=element_blank()) +
scale_x_discrete(labels=c("場站","總停車格","有車率","使用率")) +
labs(x="", y="")
ubike3<- filter(ubike,sarea=='中和區', hour==15) %>%
mutate(weekday=weekdays(as.Date(date))) %>%
filter(weekday=="周六"|weekday=="周日") %>%
group_by(sna) %>%
summarise(avg_wind=mean(max.anemo),avg_sbi=mean(avg.sbi),
avg_hu=mean(humidity),lng=unique(lng),lat=unique(lat))
# 讀取中和區的地圖,以中和區的第一筆資料的經緯度為中心
m <- get_map(location=c(lon=ubike3$lng[1], lat=ubike3$lat[1]),
maptype = "roadmap", zoom = 14)
# ggmap(m)畫出地圖,並以此為底圖,在地圖上以geom_point畫出圓圈
ggmap(m)+
geom_point(data=ubike3,
aes(x=lng, y=lat, size=avg_wind, alpha=avg_hu, color=avg_sbi))+
scale_size(range = c(5,20))+
scale_alpha(range = c(0.5,1))+
geom_text(data=ubike3,aes(x=lng,y=lat,label=sna),color='red',vjust=c(-1,1,1,0),
hjust=0,fontface=2,family = "STHeiti")+
theme(text=element_text(size=20))
參數解釋
aes(x=lng, y=lat, size=avg_wind, alpha=avg_hu, color=avg_sbi)
- X和Y為經緯度
- size以avg_wind為依據
- alpha以avg_hu為依據
- color以avg_sbi
- fill為填滿空間的顏色
- shape控制點的形狀
- 參數放在aes外的話,必須直接填入數值 (exp: size=5)
可能會用到的小撇步
nankang <- geocode('南港軟體園區', source = "google")
# nankang <- geocode(URLencode('南港軟體園區'), source = "google")
nan_map <- get_map(location=c(lon=nankang$lon,lat=nankang$lat),
zoom=15, maptype = 'roadmap', source = 'osm')
ggmap(nan_map)