课程报告复现第二期---地铁IC卡数据分析

地铁 IC 卡数据分析

数据预处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# 读取原始文件并转换编码
txt <- readLines("metro_ic_card.csv", encoding = "GBK")
writeLines(iconv(txt, from = "GBK", to = "UTF-8"), "metro_ic_cardUTF8.csv")

# 读取转换后的文件
data <- read.csv("metro_ic_cardUTF8.csv", encoding = "UTF-8")

# 加载 dplyr 库
library(dplyr)

# 重命名列
data <- rename(data, ID = 卡号, type = 交易类型, address = 交易地址, date = 交易日期,
time = 交易时间, terminal = 终端号, operater = 操作员, transmoney = 交易金额,
value = 交易值, huanchengamount = 换乘金额, transnum = 卡交易序号,
devicenum = 设备交易序号)

# 合并日期和时间
data$datetime <- as.POSIXct(paste(data$date, data$time), format = "%Y%m%d %H%M%S")
data$date <- NULL
data$time <- NULL

# 数据排序和过滤
data1 <- data %>% arrange(ID, type)
data1 <- data1 %>% filter(ID != lag(ID) | type != lag(type))

# 按地址分组
new_data <- data1 %>% group_by(address)
group_list <- new_data %>% group_by(address) %>% group_split()
group_count <- sapply(group_list, nrow)
sorted <- sort(group_count, decreasing = TRUE)

# 选择前三个最大的组
library(purrr)
target_length1 <- sorted[1]
target_group_list1 <- keep(group_list, ~length(.[[1]]) == target_length1)
target_length2 <- sorted[2]
target_group_list2 <- keep(group_list, ~length(.[[1]]) == target_length2)
target_length3 <- sorted[3]
target_group_list3 <- keep(group_list, ~length(.[[1]]) == target_length3)

# 提取地址值
col_values1 <- target_group_list1[[1]][["address"]]
col_values2 <- target_group_list2[[1]][["address"]]
col_values3 <- target_group_list3[[1]][["address"]]

# 筛选数据
address1 <- new_data %>% filter(address == col_values1[1])
address2 <- new_data %>% filter(address == col_values2[1])
address3 <- new_data %>% filter(address == col_values3[1])

# 对数据进行排序和分类
address1 <- address1 %>% arrange(datetime)
address1_in <- address1 %>% filter(type == 21)
address1_out <- address1 %>% filter(type == 22)

address2 <- address2 %>% arrange(datetime)
address2_in <- address2 %>% filter(type == 21)
address2_out <- address2 %>% filter(type == 22)

address3 <- address3 %>% arrange(datetime)
address3_in <- address3 %>% filter(type == 21)
address3_out <- address3 %>% filter(type == 22)

# 按5分钟间隔分组
address1_in$group <- cut(address1_in$datetime, breaks = "5 min")
address1_out$group <- cut(address1_out$datetime, breaks = "5 min")
address2_in$group <- cut(address2_in$datetime, breaks = "5 min")
address2_out$group <- cut(address2_out$datetime, breaks = "5 min")
address3_in$group <- cut(address3_in$datetime, breaks = "5 min")
address3_out$group <- cut(address3_out$datetime, breaks = "5 min")

# 创建时间序列
time_zone <- seq(5, 60, by = 5)

# 计算每5分钟的进出站人数
counts1in <- table(address1_in$group)
counts_address1_in <- data.frame(time = time_zone, count = counts1in)
counts2in <- table(address2_in$group)
counts_address2_in <- data.frame(time = time_zone, count = counts2in)
counts3in <- table(address3_in$group)
counts_address3_in <- data.frame(time = time_zone, count = counts3in)

counts1out <- table(address1_out$group)
counts_address1_out <- data.frame(time = time_zone, count = counts1out)
counts2out <- table(address2_out$group)
counts_address2_out <- data.frame(time = time_zone, count = counts2out)
counts3out <- table(address3_out$group)
counts_address3_out <- data.frame(time = time_zone, count = counts3out)

数据可视化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
library(ggplot2)

# 绘制进站客流图
x <- counts_address1_in$time
y1 <- counts_address1_in$count.Freq
y2 <- counts_address2_in$count.Freq
y3 <- counts_address3_in$count.Freq

plot(x, y1, type="b", pch = 25, bg="red", col="red", ylim=c(0,600),
xlab="time", ylab="number", main="进站客流")
axis(1, at = x)
lines(x, y2, type="b", pch = 21, bg="blue", col="blue")
lines(x, y3, type="b", pch = 15, col="green")
legend("topright", legend=c(col_values1[1], col_values2[1], col_values3[1]),
col=c("red", "blue", "green"), lty=1, cex=0.8)

# 绘制出站客流图
xout <- counts_address1_out$time
y1out <- counts_address1_out$count.Freq
y2out <- counts_address2_out$count.Freq
y3out <- counts_address3_out$count.Freq

plot(xout, y1out, type="b", pch = 25, bg="red", col="red", ylim=c(100,800),
xlab="time", ylab="number", main="出站客流")
axis(1, at = xout)
lines(xout, y2out, type="b", pch = 21, bg="blue", col="blue")
lines(xout, y3out, type="b", pch = 15, col="green")
legend("topright", legend=c(col_values1[1], col_values2[1], col_values3[1]),
col=c("red", "blue", "green"), lty=1, cex=0.8)

OD 矩阵分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# 筛选特定地址的数据
datasub <- list()
address_list <- c(1268001000, 1268002000, 1268003000, 1268004000, 1268005000,
1268006000, 1268007000, 1268008000, 1268009000, 1268011000)
for (i in seq_along(address_list)) {
datasub[[i]] <- filter(data1, address == address_list[i])
}
datasub_all <- bind_rows(datasub[1], datasub[2], datasub[3], datasub[4], datasub[5],
datasub[6], datasub[7], datasub[8], datasub[9], datasub[10])

# 分离进出站数据
datasub_all_in <- filter(datasub_all, type == 21)
datasub_all_out <- filter(datasub_all, type == 22)

# 合并进出站数据
datasub_bind <- inner_join(datasub_all_in, datasub_all_out, by = "ID")
datasub_bind$timedf <- datasub_bind$datetime.y - datasub_bind$datetime.x
datasub_bind <- filter(datasub_bind, !datasub_bind$timedf < 0)

# 选择需要的列
datasub_new <- select(datasub_bind, ID, datetime.x, datetime.y, type.x, type.y, address.x, address.y)
datasub_new_clean <- datasub_new[!datasub_new$address.x == datasub_new$address.y, ]

# 计算各站点间的 OD 矩阵
# (此处省略了大量重复代码,仅展示一个示例)
count_in_1268001000 <- datasub_new_clean %>%
group_by(address.x) %>%
filter(address.y == "1268001000") %>%
summarize(count = n()) %>%
pull(count)

# 创建 OD 矩阵
data_OD <- data.frame(matrix(ncol = 0, nrow = 10))
data_OD$ori <- address_list

# 填充 OD 矩阵
# (此处省略了大量重复代码,仅展示一个示例)
data_OD$dre_1268001000 <- count_in_1268001000

# 计算特定 OD 对的平均时间
datasub_new_OD1 <- filter(datasub_new_clean, address.x == 1268001000, address.y == 1268002000)
datasub_new_OD2 <- filter(datasub_new_clean, address.x == 1268002000, address.y == 1268001000)
datasub_new_OD <- bind_rows(datasub_new_OD2, datasub_new_OD1)
datasub_new_OD$datetime_df <- datasub_new_OD$datetime.y - datasub_new_OD$datetime.x
mean(datasub_new_OD$datetime_df)

部分图片展示

图1
图2