本week主要为两case
其一为聚类
其二为空气污染案例
数据:2012和1999年pm2.5的值,以观察两者之间有否不同,从而论证近年来空气洁净计划的成果
step 1:先大概瞄一眼数据
因此决定读取数据方法为
pm0<-
read.table("RD_501_88101_1999-0.txt",
comment.char ="#",
header =FALSE,
sep ="|",
na.strings ="")
head(pm0)
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
1 RD I 1 27 1 88101 1 7 105 120 19990103 00:00 NA AS
2 RD I 1 27 1 88101 1 7 105 120 19990106 00:00 NA AS
3 RD I 1 27 1 88101 1 7 105 120 19990109 00:00 NA AS
4 RD I 1 27 1 88101 1 7 105 120 19990112 00:00 8.841
5 RD I 1 27 1 88101 1 7 105 120 19990115 00:00 14.920
6 RD I 1 27 1 88101 1 7 105 120 19990118 00:00 3.878
V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28
1 3 NA NA NA NA NA NA NA NA NA NA NA NA
2 3 NA NA NA NA NA NA NA NA NA NA NA NA
3 3 NA NA NA NA NA NA NA NA NA NA NA NA
4 3 NA NA NA NA NA NA NA NA NA NA NA NA
5 3 NA NA NA NA NA NA NA NA NA NA NA NA
6 3 NA NA NA NA NA NA NA NA NA NA NA NA
> cnames <- readLines("RD_501_88101_1999-0.txt", 1)
> cnames
[1] "# RD|Action Code|State Code|County Code|Site ID|Parameter|POC|Sample Duration|Unit|Method|Date|Start Time|Sample Value|Null Data Code|Sampling Frequency|Monitor Protocol (MP) ID|Qualifier - 1|Qualifier - 2|Qualifier - 3|Qualifier - 4|Qualifier - 5|Qualifier
- 6|Qualifier - 7|Qualifier - 8|Qualifier - 9|Qualifier - 10|Alternate Method Detectable Limit|Uncertainty"
cnames<-strsplit(cnames,"|",
fixed =TRUE)
此处fixed=T表示准确匹配
cnames<-strsplit(cnames,"|",fixed=T)
> cnames
[[1]]
[1] "# RD"
[2] "Action Code"
[3] "State Code"
[4] "County Code"
[5] "Site ID"
[6] "Parameter"
[7] "POC"
[8] "Sample Duration"
[9] "Unit"
[10] "Method"
[11] "Date"
[12] "Start Time"
[13] "Sample Value"
[14] "Null Data Code"
[15] "Sampling Frequency"
[16] "Monitor Protocol (MP) ID"
[17] "Qualifier - 1"
[18] "Qualifier - 2"
[19] "Qualifier - 3"
[20] "Qualifier - 4"
[21] "Qualifier - 5"
[22] "Qualifier - 6"
[23] "Qualifier - 7"
[24] "Qualifier - 8"
[25] "Qualifier - 9"
[26] "Qualifier - 10"
[27] "Alternate Method Detectable Limit"
[28] "Uncertainty"
names(pm0)<-make.names(cnames[[1]])
将名字中间的空格去掉,换之以.以符合名字的命名规则
head(pm0)
X..RD Action.Code State.Code County.Code Site.ID Parameter
1 RD I 1 27 1 88101
2 RD I 1 27 1 88101
3 RD I 1 27 1 88101
4 RD I 1 27 1 88101
5 RD I 1 27 1 88101
6 RD I 1 27 1 88101
POC Sample.Duration Unit Method Date Start.Time
1 1 7 105 120 19990103 00:00
2 1 7 105 120 19990106 00:00
3 1 7 105 120 19990109 00:00
4 1 7 105 120 19990112 00:00
5 1 7 105 120 19990115 00:00
6 1 7 105 120 19990118 00:00
Sample.Value Null.Data.Code Sampling.Frequency
1 NA AS 3
2 NA AS 3
3 NA AS 3
4 8.841 3
5 14.920 3
6 3.878 3
Monitor.Protocol..MP..ID Qualifier...1 Qualifier...2
1 NA NA
2 NA NA
3 NA NA
4 NA NA
5 NA NA
6 NA NA
Qualifier...3 Qualifier...4 Qualifier...5 Qualifier...6
1 NA NA NA NA
2 NA NA NA NA
3 NA NA NA NA
4 NA NA NA NA
5 NA NA NA NA
6 NA NA NA NA
Qualifier...7 Qualifier...8 Qualifier...9 Qualifier...10
1 NA NA NA NA
2 NA NA NA NA
3 NA NA NA NA
4 NA NA NA NA
5 NA NA NA NA
6 NA NA NA NA
Alternate.Method.Detectable.Limit Uncertainty
1 NA NA
2 NA NA
3 NA NA
4 NA NA
5 NA NA
6 NA NA
x0
<- pm0$Sample.Value
> class(x0)
[1] "numeric"
> str(x0)
num [1:117421] NA NA NA 8.84 14.92 ...
> summary(x0)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 7.20 11.50 13.74 17.90 157.10 13217
> sum(is.na(x0))
[1] 13217
> mean(is.na(x0))
[1] 0.1125608
可见出现了miss data
summary(x1)pm1 <- read.table("RD_501_88101_2012-0.txt", comment.char = "#", header = FALSE, sep = "|", na.strings = "", nrow = 1304290)names(pm1) <- make.names(cnames[[1]])head(pm1)dim(pm1)x1 <- pm1$Sample.Valueclass(x1)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
-10.00 4.10 8.00 9.79 13.00 985.00 78313
> summary(x0)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 7.20 11.50 13.74 17.90 157.10 13217
mean(is.na(x1))##
Are missing values important here?
[1] 0.06004263
boxplot(x1,x2)
boxplot(log10(x0),
log10(x1))
Warning
messages:
1: In boxplot.default(log10(x0), log10(x1)) : NaNs produced
2: In bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z$group == :
Outlier (-Inf) in boxplot 1 is not drawn
3: In bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z$group == :
Outlier (-Inf) in boxplot 2 is not drawn
产生了一堆warning,原因在于原始数据中存在负值
pm2.5为负值,想想有些小奇怪
共有24463个负值,平均为0.01995388negative <- x1 < 0sum(negative, na.rm = T)mean(negative, na.rm = T)
来瞄一下负值出现的原因
从图中可得出一些结论dates <- pm1$Datestr(dates)dates <- as.Date(as.character(dates), "%Y%m%d")str(dates)hist(dates, "month") ## Check what's going on in months 1--6
好似冬天的miss要多于夏天的
好吧,老师说与其瞄全国的中位数,何不就锁定某个位置的某一个监控器呢,或者锁定某个州呢
## Find a monitor for New York State that exists in both datasetssite0 <- unique(subset(pm0, State.Code == 36, c(County.Code, Site.ID)))site1 <- unique(subset(pm1, State.Code == 36, c(County.Code, Site.ID)))site0 <- paste(site0[,1], site0[,2], sep = ".")site1 <- paste(site1[,1], site1[,2], sep = ".")str(site0)str(site1)both <- intersect(site0, site1)print(both)
## Find how many observations available at each monitorpm0$county.site <- with(pm0, paste(County.Code, Site.ID, sep = "."))pm1$county.site <- with(pm1, paste(County.Code, Site.ID, sep = "."))cnt0 <- subset(pm0, State.Code == 36 & county.site %in% both)cnt1 <- subset(pm1, State.Code == 36 & county.site %in% both)sapply(split(cnt0, cnt0$county.site), nrow)sapply(split(cnt1, cnt1$county.site), nrow)## Choose county 63 and side ID 2008pm1sub <- subset(pm1, State.Code == 36 & County.Code == 63 & Site.ID == 2008)pm0sub <- subset(pm0, State.Code == 36 & County.Code == 63 & Site.ID == 2008)dim(pm1sub)dim(pm0sub)## Plot data for 2012dates1 <- pm1sub$Datex1sub <- pm1sub$Sample.Valueplot(dates1, x1sub)dates1 <- as.Date(as.character(dates1), "%Y%m%d")str(dates1)plot(dates1, x1sub)## Plot data for 1999dates0 <- pm0sub$Datedates0 <- as.Date(as.character(dates0), "%Y%m%d")x0sub <- pm0sub$Sample.Valueplot(dates0, x0sub)## Plot data for both years in same panelpar(mfrow = c(1, 2), mar = c(4, 4, 2, 1))plot(dates0, x0sub, pch = 20)abline(h = median(x0sub, na.rm = T))plot(dates1, x1sub, pch = 20) ## Whoa! Different rangesabline(h = median(x1sub, na.rm = T))## Find global rangerng <- range(x0sub, x1sub, na.rm = T)rngpar(mfrow = c(1, 2), mar = c(4, 4, 2, 1))plot(dates0, x0sub, pch = 20, ylim = rng)abline(h = median(x0sub, na.rm = T))plot(dates1, x1sub, pch = 20, ylim = rng)abline(h = median(x1sub, na.rm = T))## Show state-wide means and make a plot showing trendhead(pm0)mn0 <- with(pm0, tapply(Sample.Value, State.Code, mean, na.rm = T))str(mn0)summary(mn0)mn1 <- with(pm1, tapply(Sample.Value, State.Code, mean, na.rm = T))str(mn1)## Make separate data frames for states / yearsd0 <- data.frame(state = names(mn0), mean = mn0)d1 <- data.frame(state = names(mn1), mean = mn1)mrg <- merge(d0, d1, by = "state")dim(mrg)head(mrg)## Connect linespar(mfrow = c(1, 1))with(mrg, plot(rep(1, 52), mrg[, 2], xlim = c(.5, 2.5)))with(mrg, points(rep(2, 52), mrg[, 3]))segments(rep(1, 52), mrg[, 2], rep(2, 52), mrg[, 3])