[root@hadoop1 ~]# spark-submit --conf "spark.pyspark.python=$(which python3)" --conf "spark.pyspark.driver.python=$(which python3)" /taxi/code/predict.py
25/07/07 18:52:00 ERROR Utils: Failed to create directory /tmp/spark-ef4bfb75-a547-4bb0-84e4-196117b4d2b1
java.nio.file.FileSystemException: /tmp/spark-ef4bfb75-a547-4bb0-84e4-196117b4d2b1: No space left on device
at sun.nio.fs.UnixException.translateToIOException(UnixException.java:91)
at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:102)
at sun.nio.fs.UnixException.rethrowAsIOException(UnixException.java:107)
at sun.nio.fs.UnixFileSystemProvider.createDirectory(UnixFileSystemProvider.java:384)
at java.nio.file.Files.createDirectory(Files.java:674)
at java.nio.file.Files.createAndCheckIsDirectory(Files.java:781)
at java.nio.file.Files.createDirectories(Files.java:767)
at org.apache.spark.util.Utils$.createDirectory(Utils.scala:324)
at org.apache.spark.util.Utils$.createTempDir(Utils.scala:342)
at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:344)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:901)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1046)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1055)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
[... the same "ERROR Utils: Failed to create directory" message and identical stack trace repeat nine more times, once per retry, each against a different /tmp/spark-<uuid> directory — matching the "after 10 attempts" in the final exception below ...]
Exception in thread "main" java.io.IOException: Failed to create a temp directory (under /tmp) after 10 attempts!
at org.apache.spark.util.Utils$.createDirectory(Utils.scala:318)
at org.apache.spark.util.Utils$.createTempDir(Utils.scala:342)
at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:344)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:901)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1046)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1055)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
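
The root cause is right at the top of the trace: spark-submit could not create its scratch directory because the filesystem backing /tmp has no free space ("No space left on device"), and after ten failed attempts it gave up. The immediate fix is to check usage with df -h and clear out stale /tmp/spark-* directories left behind by earlier crashed jobs. Relocating scratch space, as the script below does with spark.local.dir=/data/tmp (or via the SPARK_LOCAL_DIRS environment variable), only covers directories created once the application is running; per the stack trace, this failure happens earlier, in SparkSubmit.prepareSubmitEnvironment, whose temp directory defaults to the JVM's java.io.tmpdir (normally /tmp), so /tmp itself needs headroom either way.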
[root@hadoop1 ~]#

The script being submitted, /taxi/code/predict.py:

# coding=gbk
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import *
from pyspark.ml.regression import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel  # CrossValidatorModel is needed by the model-reload branch below
from pyspark.ml import Pipeline
from pyspark import StorageLevel
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import traceback
import os
import sys
import smtplib
import shutil
import glob
from email.mime.text import MIMEText
from datetime import datetime, date, timedelta
from math import radians, sin, cos, sqrt, atan2  # careful: these shadow the same-named pyspark.sql.functions imported above
import holidays
# ======================== Centralized configuration ========================
class Config:
    BASE_PATH = "/taxi"
    LOG_DIR = f"{BASE_PATH}/logs"
    FEATURE_COLS = [
        "passenger_count", "trip_distance", "haversine_distance", "manhattan_distance",
        "pickup_hour", "pickup_weekday", "pickup_month", "is_weekend", "is_holiday",
        "VendorID_index", "RatecodeID_index", "payment_type_index", "time_period_index"
        # "location_traffic" is not derived anywhere in this script, so it is left
        # out of the feature list (VectorAssembler would fail on a missing column)
    ]
# ======================== SparkSession initialization (performance-tuned) ========================
print("[init] Creating Spark session...")
# Note: spark.local.dir relocates runtime scratch space to /data/tmp, away from
# the full /tmp volume; spark-submit's own temp directory is still created
# under java.io.tmpdir, which is what failed in the log above.
spark = SparkSession.builder \
    .appName("NYCTaxiTripDurationPrediction") \
    .config("spark.sql.warehouse.dir", "file:///taxi/tmp/spark-warehouse") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.executor.instances", "4") \
    .config("spark.executor.cores", "2") \
    .config("spark.sql.shuffle.partitions", "16") \
    .config("spark.local.dir", "/data/tmp") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()
print("[init] Spark session created")
# ======================== Data schema definition ========================
print("[data prep] Defining the input schema...")
schema = StructType([
    StructField("VendorID", IntegerType(), True),
    StructField("tpep_pickup_datetime", StringType(), True),
    StructField("tpep_dropoff_datetime", StringType(), True),
    StructField("passenger_count", IntegerType(), True),
    StructField("trip_distance", DoubleType(), True),
    StructField("PULocationID", IntegerType(), True),
    StructField("DOLocationID", IntegerType(), True),
    StructField("RatecodeID", IntegerType(), True),
    StructField("payment_type", IntegerType(), True),
    StructField("fare_amount", DoubleType(), True),
    StructField("tip_amount", DoubleType(), True),
    StructField("total_amount", DoubleType(), True)
])
print("[data prep] Schema defined")
# ======================== Location data ========================
print("[feature engineering] Loading location coordinate data...")
def load_location_data():
    """Load NYC taxi-zone coordinates, including the full location mapping."""
    return {
        1: (40.713, -74.006),    # Manhattan
        2: (40.750, -73.937),    # Queens
        3: (40.678, -73.944),    # Brooklyn
        4: (40.650, -73.949),    # Bronx
        5: (40.579, -74.152),    # Staten Island
        79: (40.641, -73.778),   # JFK Airport
        186: (40.759, -73.984),  # Times Square
        140: (40.714, -74.006),  # Financial District
        236: (40.761, -73.977),  # Midtown East
        211: (40.721, -74.001),  # SoHo
        148: (40.758, -73.985),  # Theater District
        141: (40.752, -73.978),  # Grand Central
        138: (40.751, -73.994),  # Penn Station
        181: (40.743, -74.008),  # Chelsea
        161: (40.776, -73.982),  # Upper East Side
        261: (40.774, -73.954),  # Upper East Side
        113: (40.734, -73.989),  # Greenwich Village
        107: (40.728, -74.008),  # Tribeca
        137: (40.746, -74.004),  # Flatiron District
        158: (40.748, -73.988),  # Koreatown
        246: (40.753, -73.984),  # Herald Square
        190: (40.764, -73.973),  # Turtle Bay
        68: (40.742, -73.988),   # NoMad
        90: (40.749, -73.992),   # Garment District
        132: (40.715, -74.013),  # Battery Park City
        216: (40.703, -74.013),  # World Trade Center
        164: (40.763, -73.982),  # Columbus Circle
        237: (40.770, -73.982),  # Lenox Hill
        263: (40.768, -73.962),  # Yorkville
        # Fallback location (downtown NYC)
        -1: (40.7128, -74.0060)
    }
print("[feature engineering] Location coordinate data loaded")
# ======================== Holiday calendar ========================
print("[feature engineering] Initializing the US holiday calendar...")
us_holidays = holidays.US(years=[2024])
print(f"[feature engineering] Loaded {len(us_holidays)} holidays")
# ======================== Hardened error handling (fixed version) ========================
def log_error(error_msg, dataset=None):
    """Fixed error-handling logic with added safety guards."""
    try:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        error_type = "CRITICAL"
        # 1. Local log file
        os.makedirs(Config.LOG_DIR, exist_ok=True)
        log_path = f"{Config.LOG_DIR}/error_{timestamp}.log"
        full_error = f"[{error_type}] {timestamp}\nError: {error_msg}\n{traceback.format_exc()}"
        with open(log_path, "w") as f:
            f.write(full_error)
        print(f"[error handling] Error log written to: {log_path}")
        # 2. Email alert (tagged with severity)
        try:
            msg = MIMEText(f"[{error_type}] Program failure:\n{full_error[:2000]}...")
            msg['Subject'] = f"[{error_type}] NYC Taxi Prediction Error - {timestamp}"
            msg['From'] = "alert@taxi-system.com"
            msg['To'] = "admin@taxi-system.com"
            # Deliver through the local mail service for safety
            with smtplib.SMTP('localhost', 25) as server:
                server.send_message(msg)
            print(f"[error handling] {error_type} alert email sent")
        except Exception as email_err:
            print(f"[error handling] Failed to send email: {str(email_err)}")
    except Exception as e:
        print(f"[error handling] Exception while handling the error: {str(e)}")
    finally:
        # Release Spark resources unconditionally; note this stops the
        # SparkContext on every call, so log_error should only be invoked
        # when the job is shutting down anyway.
        try:
            if spark:
                spark.sparkContext.stop()
                print("[error handling] Spark resources released")
        except Exception:
            pass
# ======================== Efficient geo-distance functions ========================
def haversine_distance(lat1, lon1, lat2, lon2):
    """Vectorized haversine distance in kilometres (accepts scalars or arrays)."""
    R = 6371  # Earth radius (km)
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return R * c
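
# Quick sanity check (hypothetical usage): Times Square (40.759, -73.984) to
# JFK (40.641, -73.778) should come out at roughly 22 km:
#   haversine_distance(40.759, -73.984, 40.641, -73.778)  # ~21.8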
# Register a pandas UDF so the computation really is vectorized per Arrow batch
# (a plain @udf would invoke the Python function once per row); the pipeline
# below ends up computing this distance with built-in SQL functions instead,
# so this UDF is kept for reference.
print("[feature engineering] Registering the vectorized haversine UDF...")
@pandas_udf(DoubleType())
def vectorized_haversine(pu_lat: pd.Series, pu_lon: pd.Series,
                         do_lat: pd.Series, do_lon: pd.Series) -> pd.Series:
    """Batch haversine distance (km); null inputs propagate as NaN."""
    return pd.Series(haversine_distance(pu_lat, pu_lon, do_lat, do_lon))
print("[feature engineering] UDF registered")
# ======================== Feature-engineering functions (performance-tuned) ========================
def add_temporal_features(df):
    """Temporal features, using integer encodings instead of strings."""
    print("[feature engineering] Adding temporal features...")
    df = df.withColumn("pickup_ts", to_timestamp(col("tpep_pickup_datetime"), "yyyy-MM-dd HH:mm:ss")) \
           .withColumn("dropoff_ts", to_timestamp(col("tpep_dropoff_datetime"), "yyyy-MM-dd HH:mm:ss")) \
           .withColumn("trip_duration", (unix_timestamp(col("dropoff_ts")) - unix_timestamp(col("pickup_ts"))) / 60)
    # Numeric time features (Spark's dayofweek: 1 = Sunday, 7 = Saturday)
    df = df.withColumn("pickup_hour", hour(col("pickup_ts"))) \
           .withColumn("pickup_weekday", dayofweek(col("pickup_ts"))) \
           .withColumn("pickup_month", month(col("pickup_ts"))) \
           .withColumn("is_weekend", when(col("pickup_weekday").isin(1, 7), 1).otherwise(0)) \
           .withColumn("is_holiday", when(to_date(col("pickup_ts")).isin(list(us_holidays)), 1).otherwise(0))
    # Time-of-day buckets as integer codes: 0 = morning peak (06-09),
    # 1 = evening peak (16-19), 2 = overnight (00-05), 3 = everything else
    time_period_expr = expr("""
        CASE WHEN pickup_hour BETWEEN 6 AND 9 THEN 0
             WHEN pickup_hour BETWEEN 16 AND 19 THEN 1
             WHEN pickup_hour BETWEEN 0 AND 5 THEN 2
             ELSE 3 END""")
    df = df.withColumn("time_period", time_period_expr)
    return df
def add_spatial_features(df):
    """Spatial features with minimal UDF overhead."""
    print("[feature engineering] Adding spatial features...")
    location_data = load_location_data()
    location_bc = spark.sparkContext.broadcast(location_data)
    # Optimization: one struct-returning UDF instead of chained UDF calls
    @udf(returnType=StructType([
        StructField("pu_lat", DoubleType()),
        StructField("pu_lon", DoubleType()),
        StructField("do_lat", DoubleType()),
        StructField("do_lon", DoubleType())
    ]))
    def get_coords(pu_id, do_id):
        pu_coord = location_bc.value.get(int(pu_id), location_bc.value[-1])
        do_coord = location_bc.value.get(int(do_id), location_bc.value[-1])
        return (pu_coord[0], pu_coord[1], do_coord[0], do_coord[1])
    # A single UDF call fetches all four coordinates
    df = df.withColumn("coords", get_coords(col("PULocationID"), col("DOLocationID"))) \
           .select("*",
                   col("coords.pu_lat"),
                   col("coords.pu_lon"),
                   col("coords.do_lat"),
                   col("coords.do_lon"))
    # Distances via built-in Spark SQL functions; manhattan_distance is an L1
    # proxy in raw degrees, not kilometres
    df = df.withColumn("haversine_distance",
                       expr("6371 * 2 * ASIN(SQRT(POWER(SIN(RADIANS(do_lat - pu_lat)/2),2) + COS(RADIANS(pu_lat)) * COS(RADIANS(do_lat)) * POWER(SIN(RADIANS(do_lon - pu_lon)/2),2)))")) \
           .withColumn("manhattan_distance",
                       abs(col("pu_lat") - col("do_lat")) + abs(col("pu_lon") - col("do_lon")))
    return df.drop("coords")
# ======================== Main program (hardened resource management) ========================
def main():
    # Cached-dataset handles, initialized up front so the finally block can clean them up
    train_data = None
    test_data = None
    sampled_df = None
    try:
        # ======================== Data loading ========================
        print("[data load] Loading data from HDFS...")
        data_path = "hdfs:///user/root/taxi/data/cleaned/cleaned_yellow_tripdata_2024-*.csv"
        print(f"[data load] Data path: {data_path}")
        df = spark.read.csv(data_path, schema=schema, header=True)
        print(f"[data load] Raw row count: {df.count()}")
        # Fixed sampling fraction
        sample_fraction = 0.001
        print(f"[resource tuning] Sampling fraction set to: {sample_fraction*100}%")
        sampled_df = df.sample(False, sample_fraction, seed=42)
        print(f"[data load] Sampled row count: {sampled_df.count()}")
        # Tiered caching strategy
        print("[data processing] Caching the sampled data...")
        sampled_df.persist(StorageLevel.MEMORY_AND_DISK)
        sampled_df.count()  # materialize the cache
        print("[data processing] Data cached")
        # ======================== Feature-engineering pipeline ========================
        print("[feature engineering] Starting feature processing...")
        df = sampled_df.transform(add_temporal_features)
        # Outlier filtering
        print("[data processing] Filtering outliers...")
        initial_count = df.count()
        df = df.filter((col("trip_duration") > 0.5) & (col("trip_duration") < 180)) \
               .filter(col("trip_distance") > 0.01) \
               .filter((col("passenger_count") > 0) & (col("passenger_count") < 7)) \
               .filter(col("fare_amount") > 2.5) \
               .filter(col("RatecodeID").isin([1, 2, 3, 4, 5, 6]))
        filtered_count = df.count()
        print(f"[data processing] Outlier filtering done: removed {initial_count - filtered_count} rows, {filtered_count} remaining")
        # Spatial features
        df = df.transform(add_spatial_features)
        # Release the cached sample
        print("[resource management] Unpersisting the sampled data...")
        sampled_df.unpersist()
        sampled_df = None
        print("[resource management] Cache released")
        # ======================== Feature encoding ========================
        print("[feature engineering] Starting feature encoding...")
        indexers = [
            StringIndexer(inputCol="VendorID", outputCol="VendorID_index", handleInvalid="keep"),
            StringIndexer(inputCol="RatecodeID", outputCol="RatecodeID_index", handleInvalid="keep"),
            StringIndexer(inputCol="payment_type", outputCol="payment_type_index", handleInvalid="keep"),
            StringIndexer(inputCol="time_period", outputCol="time_period_index", handleInvalid="keep")
        ]
        assembler = VectorAssembler(
            inputCols=Config.FEATURE_COLS,
            outputCol="features_raw"
        )
        scaler = StandardScaler(inputCol="features_raw", outputCol="features", withMean=True, withStd=True)
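        # These stages only take effect once assembled into the per-model
        # Pipeline below (indexers -> assembler -> scaler -> regressor); because
        # the scaler sits inside the Pipeline, cross-validation fits it on the
        # training folds only, avoiding leakage from the evaluation fold.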
        # ======================== Train/test split ========================
        print("[split] Splitting into training and test sets...")
        max_date = df.agg(max("pickup_ts")).first()[0]
        print(f"[split] Latest pickup timestamp: {max_date}")
        split_date = max_date - timedelta(days=14)
        print(f"[split] Split date: {split_date}")
        train_data = df.filter(col("pickup_ts") < split_date).cache()
        test_data = df.filter(col("pickup_ts") >= split_date).cache()
        train_count = train_data.count()
        test_count = test_data.count()
        print(f"[split] Training rows: {train_count}, test rows: {test_count}")
        print(f"[split] Train/test ratio: {train_count/(train_count+test_count):.2%}/{test_count/(train_count+test_count):.2%}")
        # ======================== Model training (with incremental-training support) ========================
        model_path = "hdfs:///models/nyc_taxi_trip_duration_model"
        best_model = None
        # Caveat: os.path.exists only checks the driver's local filesystem, so a
        # model saved to HDFS is never detected here and retraining always runs;
        # a proper existence check would go through the Hadoop FileSystem API.
        if os.path.exists(model_path):
            print(f"[model load] Existing model found: {model_path}")
            best_model = CrossValidatorModel.load(model_path)
        else:
            print("[model training] Initializing models...")
            # Tuned model parameters
            models = {
                "LinearRegression": LinearRegression(
                    featuresCol="features",
                    labelCol="trip_duration",
                    maxIter=50,
                    regParam=0.01,
                    elasticNetParam=0.8
                ),
                "GradientBoosting": GBTRegressor(
                    featuresCol="features",
                    labelCol="trip_duration",
                    maxIter=50,
                    maxDepth=5,
                    maxBins=32,
                    stepSize=0.05,
                    seed=42
                ),
                "RandomForest": RandomForestRegressor(
                    featuresCol="features",
                    labelCol="trip_duration",
                    numTrees=30,
                    maxDepth=6,
                    maxBins=32,
                    subsamplingRate=0.8,
                    seed=42
                )
            }
            # Parallelized cross-validation over the candidate models
            cv_models = {}
            for name, model in models.items():
                print(f"\n[model training] ====== Training the {name} model ======")
                # Build the full pipeline
                pipeline_stages = indexers + [assembler, scaler, model]
                model_pipeline = Pipeline(stages=pipeline_stages)
                # Cross-validation setup
                cv = CrossValidator(
                    estimator=model_pipeline,
                    estimatorParamMaps=ParamGridBuilder().build(),
                    evaluator=RegressionEvaluator(
                        labelCol="trip_duration",
                        metricName="rmse",
                        predictionCol="prediction"
                    ),
                    numFolds=3,
                    parallelism=4  # matches the executor core count
                )
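                # With ParamGridBuilder().build(), estimatorParamMaps holds a
                # single empty ParamMap, so this CrossValidator only estimates
                # the error of the fixed configuration above and tunes nothing.
                # A sketch of a real grid (hypothetical values, tree models only):
                #   grid = (ParamGridBuilder()
                #           .addGrid(model.maxDepth, [4, 6, 8])
                #           .addGrid(model.maxIter, [50, 100])
                #           .build())
                # which would be passed as estimatorParamMaps=grid.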
                # Fit the model
                print(f"[model training] Fitting {name}...")
                start_time = datetime.now()
                cv_model = cv.fit(train_data)
                training_time = datetime.now() - start_time
                print(f"[model training] {name} finished, elapsed: {training_time}")
                cv_models[name] = cv_model
            # Pick the best model by cross-validated RMSE
            if cv_models:
                best_model_name = min(cv_models.keys(),
                                      key=lambda x: cv_models[x].avgMetrics[0])
                best_model = cv_models[best_model_name]
                print(f"[model selection] Best model: {best_model_name}")
                # Persist the model
                print(f"[model saving] Saving the model to: {model_path}")
                best_model.write().overwrite().save(model_path)
            else:
                raise RuntimeError("All model training runs failed")
        # ======================== Model evaluation ========================
        print("[model evaluation] Evaluating the best model...")
        predictions = best_model.transform(test_data)
        # Compute evaluation metrics
        evaluator_rmse = RegressionEvaluator(labelCol="trip_duration", metricName="rmse")
        evaluator_mae = RegressionEvaluator(labelCol="trip_duration", metricName="mae")
        evaluator_r2 = RegressionEvaluator(labelCol="trip_duration", metricName="r2")
        rmse = evaluator_rmse.evaluate(predictions)
        mae = evaluator_mae.evaluate(predictions)
        r2 = evaluator_r2.evaluate(predictions)
        print(f"[model evaluation] RMSE: {rmse:.2f} min, MAE: {mae:.2f} min, R2: {r2:.4f}")
        # ======================== Saving results ========================
        print("[results] Saving predictions...")
        output_path = "hdfs:///models/nyc_taxi_predictions"
        predictions.select(
            "pickup_ts", "dropoff_ts", "trip_duration", "prediction",
            "PULocationID", "DOLocationID", "trip_distance", "haversine_distance"
        ).write.mode("overwrite").parquet(output_path)
        print(f"[results] Predictions saved to: {output_path}")
        # ======================== Visualization ========================
        print("[visualization] Generating result charts...")
        os.makedirs("/taxi/results", exist_ok=True)
        # Scatter plot of actual vs. predicted durations
        sample_pd = predictions.sample(0.01).toPandas()
        plt.figure(figsize=(10, 7))
        plt.scatter(sample_pd["trip_duration"], sample_pd["prediction"], alpha=0.3)
        plt.plot([0, 180], [0, 180], 'r--')
        plt.xlabel("Actual trip duration (min)")
        plt.ylabel("Predicted trip duration (min)")
        plt.title("Actual vs. predicted trip duration")
        plt.grid(True)
        plt.savefig("/taxi/results/prediction_scatter.png")
        print("[visualization] Scatter plot saved to: /taxi/results/prediction_scatter.png")
        # Error-distribution histogram
        sample_pd["error"] = sample_pd["prediction"] - sample_pd["trip_duration"]
        plt.figure(figsize=(10, 6))
        plt.hist(sample_pd["error"], bins=50, alpha=0.7)
        plt.xlabel("Prediction error (min)")
        plt.ylabel("Frequency")
        plt.title("Prediction error distribution")
        plt.grid(True)
        plt.savefig("/taxi/results/error_distribution.png")
        print("[visualization] Error distribution plot saved to: /taxi/results/error_distribution.png")
        print("[done] All steps completed!")
    except Exception as e:
        error_msg = f"Main program error: {str(e)}"
        log_error(error_msg, sampled_df)
        sys.exit(1)
    finally:
        # Safely release all cached datasets
        print("[cleanup] Releasing resources...")
        try:
            for df_ref in [train_data, test_data, sampled_df]:
                if df_ref is not None and df_ref.is_cached:
                    df_ref.unpersist()
                    print(f"[cleanup] Unpersisted cached dataset: {df_ref}")
        except Exception as e:
            print(f"[cleanup] Failed to release caches: {str(e)}")
        # Stop the Spark session
        try:
            if spark:
                spark.stop()
                print("[cleanup] Spark session stopped")
        except Exception:
            pass
if __name__ == "__main__":
    print("="*50)
    print("Starting the optimized NYC taxi trip duration prediction job")
    print("="*50)
    try:
        main()
    except Exception as e:
        print(f"Fatal error in main: {str(e)}")
        log_error(f"Fatal error in main: {str(e)}")
    finally:
        print("="*50)
        print("Program finished")
        print("="*50)