本文使用上篇文章中生成的稀疏向量进行建模。
因为 from pyspark_lightgbm import LGBMClassifier 和 from synapse.ml.lightgbm import LightGBMClassifier 在集群上均未安装,故使用原生 lgb 进行建模。(理论上前两者效率更优,可并行处理数据;而原生 lgb 只能单机处理。)
# 网格寻参
import functools
import os
import sys
import time

import joblib
import lightgbm as lgb
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from scipy.sparse import csr_matrix
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score,
    confusion_matrix
)
from sklearn.model_selection import train_test_split, GridSearchCV
# 配置环境变量
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
spark = SparkSession.builder.config("spark.metrics.conf",
"/opt/mobdata/spark/spark-2.4.3.mob1-bin-2.6.5/conf/metrics.properties") \
.config("spark.driver.memory", "48g") \
.config("spark.driver.maxResultSize", "16g") \
.appName("test_djj") \
.enableHiveSupport() \
.getOrCreate()
# Timing decorator: prints start/finish markers and wall-clock duration.
def timeit(func):
    """Decorator that logs when *func* starts, when it finishes, and the
    elapsed wall-clock time in seconds, then returns *func*'s result
    unchanged.

    Args:
        func: the callable to wrap.

    Returns:
        A wrapper callable with the same call signature as *func*.
    """
    # functools.wraps preserves func's metadata (__name__, __doc__, etc.)
    # on the wrapper; without it, every decorated function would report
    # its name as "wrapper", which also breaks introspection/joblib dumps.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        print(f"开始: {func.__name__}...")
        result = func(*args, **kwargs)
        end_time = time.time()
        elapsed = end_time - start_time
        print(f"完成: {func.__name__} | 耗时: {elapsed:.2f}秒")
        return result
    return wrapper
# 1. 数据加载并转换为CSR矩阵
@timeit
def load_and_prepare_data():
print("加载数据并转换为CSR矩阵..."

最低0.47元/天 解锁文章
9125

被折叠的 条评论
为什么被折叠?



