2020-08-16

```
WARN scheduler.TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, ninjutsuH5, executor 1): org.apache.spark.SparkException: Task failed while writing rows
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:270)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:189)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$write$1$$anonfun$apply$mcV$sp$1.apply(FileFormatWriter.scala:188)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:344)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.RuntimeException: Error while encoding: java.lang.ArrayIndexOutOfBoundsException: 0
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 0, rksj), StringType), true) AS rksj#4
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 1, imei), StringType), true) AS imei#5
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 2, imsi), StringType), true) AS imsi#6
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 3, longitude), StringType), true) AS longitude#7
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 4, latitude), StringType), true) AS latitude#8
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 5, phone_mac), StringType), true) AS phone_mac#9
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 6, device_mac), StringType), true) AS device_mac#10
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 7, device_number), StringType), true) AS device_number#11
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 8, collect_time), StringType), true) AS collect_time#12
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 9, username), StringType), true) AS username#13
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 10, phone), StringType), true) AS phone#14
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 11, object_username), StringType), true) AS object_username#15
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 12, send_message), StringType), true) AS send_message#16
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 13, accept_message), StringType), true) AS accept_message#17
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 14, message_time), StringType), true) AS message_time#18
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 15, id), StringType), true) AS id#19
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 16, table1), StringType), true) AS table1#20
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 17, filename), StringType), true) AS filename#21
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 18, absolute_filename), StringType), true) AS absolute_filename#22
	at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.toRow(ExpressionEncoder.scala:290)
	at org.apache.spark.sql.SparkSession$$anonfun$3.apply(SparkSession.scala:582)
	at org.apache.spark.sql.SparkSession$$anonfun$3.apply(SparkSession.scala:582)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$SingleDirectoryWriteTask.execute(FileFormatWriter.scala:324)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:256)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$$anonfun$org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask$3.apply(FileFormatWriter.scala:254)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1375)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.org$apache$spark$sql$execution$datasources$FileFormatWriter$$executeTask(FileFormatWriter.scala:259)
	... 8 more
Caused by: java.lang.ArrayIndexOutOfBoundsException: 0
	at org.apache.spark.sql.catalyst.expressions.GenericRow.get(rows.scala:173)
	at org.apache.spark.sql.Row$class.isNullAt(Row.scala:191)
	at org.apache.spark.sql.catalyst.expressions.GenericRow.isNullAt(rows.scala:165)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.evalIfCondExpr$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
	at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.toRow(ExpressionEncoder.scala:287)
	... 17 more
```

Following the stack trace to line 290 of ExpressionEncoder.scala shows this is really a serialization (encoding) error: the encoder calls `isNullAt(0)` on a `GenericRow` whose value array is empty, i.e. a row carrying fewer fields than the 19-column schema expects, so reading field 0 throws `ArrayIndexOutOfBoundsException: 0`.
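The mechanism is easy to reproduce: `createDataFrame(rdd, schema)` does not check row arity up front (note the `SparkSession$$anonfun$3` frame in the trace, where the row encoder is applied lazily), so a `Row` with fewer values than the schema only fails when the encoder reads field 0 at write time. A minimal sketch — column names come from the log above; the output path and values are made up:

```scala
import scala.util.Try
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val spark = SparkSession.builder()
  .appName("encoder-arity-repro")
  .master("local[*]")
  .getOrCreate()

// Two of the nineteen string columns from the log, for brevity.
val schema = StructType(Seq(
  StructField("rksj", StringType, nullable = true),
  StructField("imei", StringType, nullable = true)
))

// Row() carries zero values, but the schema promises two columns.
// Nothing complains until write time, when the encoder's isNullAt(0)
// hits the empty value array:
//   Error while encoding: java.lang.ArrayIndexOutOfBoundsException: 0
val badRows = spark.sparkContext.parallelize(Seq(Row()))
val attempt = Try(spark.createDataFrame(badRows, schema).write.mode("overwrite").csv("/tmp/encoder_repro"))
println(attempt.isSuccess)  // false: Task failed while writing rows

// Fix: every Row must carry exactly one value per schema field (dummy values here).
val goodRows = spark.sparkContext.parallelize(Seq(Row("20200816", "860000000000001")))
spark.createDataFrame(goodRows, schema).write.mode("overwrite").csv("/tmp/encoder_repro")
```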

When this happens, there are two things to check.

First, whether the fields in the DataFrame match the fields of the Hive table; go through the columns carefully.
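A quick way to carry out that check is to diff the DataFrame's schema against the target table's. A small sketch — the helper name `diffSchemas` and the table name `db.target_table` are hypothetical, and `spark` is assumed to be an active `SparkSession` with Hive support:

```scala
import org.apache.spark.sql.{DataFrame, SparkSession}

// Compare the DataFrame we are about to write against the target Hive table.
def diffSchemas(spark: SparkSession, df: DataFrame, hiveTable: String): Unit = {
  val dfFields  = df.schema.fields.map(f => f.name -> f.dataType.simpleString).toSet
  val tblFields = spark.table(hiveTable).schema.fields.map(f => f.name -> f.dataType.simpleString).toSet

  println(s"only in the DataFrame:  ${dfFields.diff(tblFields).mkString(", ")}")
  println(s"only in the Hive table: ${tblFields.diff(dfFields).mkString(", ")}")
}

// diffSchemas(spark, df, "db.target_table")   // hypothetical names
```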

Or, second, whether the data disagrees with the fields described by the schema / `case class`.
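A common way to end up with such inconsistent rows is hand-parsing delimited text: a malformed line yields fewer tokens than the schema has columns, producing exactly the short `Row` seen above. A defensive sketch, with a hypothetical input path and a 3-column stand-in for the log's 19-column schema:

```scala
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val spark = SparkSession.builder()
  .appName("safe-parse")
  .master("local[*]")
  .getOrCreate()

// Stand-in for the 19 string columns in the log; three for brevity.
val schema = StructType(Seq("rksj", "imei", "imsi").map(StructField(_, StringType, nullable = true)))

val rows = spark.sparkContext
  .textFile("/data/input.txt")          // hypothetical input path
  .map(_.split("\t", -1))               // limit -1 keeps trailing empty fields
  .filter(_.length == schema.length)    // drop lines that cannot satisfy the schema
  .map(parts => Row(parts: _*))

val df = spark.createDataFrame(rows, schema)
```

Note the `-1` limit on `split`: without it, trailing empty fields are silently dropped, which is a frequent source of rows shorter than the schema.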
