Converting string IPs with a UDF in Spark

This post shows how to convert string IP addresses into long-integer IPs in Spark: a regular expression validates the IP format, then each octet is converted to a long and combined with bitwise left shifts to produce the numeric form of the address.

Today I needed to convert IPs from strings (String) to long integers (Long) in Spark. I referred to two articles, https://blog.youkuaiyun.com/cjuexuan/article/details/54912215 and https://blog.youkuaiyun.com/key_xyes/article/details/79818196, extracted the basic idea from them, and wrapped it up as a UDF, as follows:

import java.util.regex.Pattern

sqlContext.udf.register("Ip2Long", (ip: String) => {
  ip match {
    // Only convert strings that are well-formed dotted-quad IPv4 addresses
    case i if i.matches("""^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$""") =>
      val parts = i.trim.split(Pattern.quote("."))
      var ipLong = 0L
      // Shift the accumulator left by 8 bits and OR in each octet, first octet first,
      // so the first octet ends up in the most significant byte (the conventional
      // ip2long byte order, matching MySQL's INET_ATON)
      for (part <- parts) {
        ipLong = (ipLong << 8) | part.toLong
      }
      ipLong
    // Anything that is not a valid IPv4 string maps to 0L, keeping the return type Long
    case _ => 0L
  }
})
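
For reference, the same conversion can be checked outside of Spark. Below is a minimal standalone sketch of the core logic (the object and method names are only illustrative), with a quick sanity check on a sample address:

import java.util.regex.Pattern

// Standalone sketch of the conversion used by the UDF above (illustrative names)
object Ip2LongCheck {
  def ip2Long(ip: String): Long = {
    val parts = ip.trim.split(Pattern.quote("."))
    var ipLong = 0L
    for (part <- parts) {
      ipLong = (ipLong << 8) | part.toLong
    }
    ipLong
  }

  def main(args: Array[String]): Unit = {
    // 192.168.1.1 = 192*2^24 + 168*2^16 + 1*2^8 + 1 = 3232235777
    println(ip2Long("192.168.1.1")) // expected output: 3232235777
  }
}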

With that, I can use my custom function in SQL:

val df = spark.sql("select ip, Ip2Long(ip), region from mytable").toDF("ipStr", "ipInt", "region")
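
As a quick sanity check (an illustrative sketch, assuming the UDF has been registered as shown above), the function can also be called on literal values:

// Check a valid address and an invalid string (illustrative)
spark.sql("select Ip2Long('192.168.1.1') as ip_long, Ip2Long('not-an-ip') as bad_ip").show()
// expected: ip_long = 3232235777, bad_ip = 0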

Leaving this here as a small note to remember it by.
