Python Spark Streaming: Real-time Kafka Nginx Log Analysis with MySQL Storage

Create the MySQL database and table
> create database nginx;
> use nginx;
> create table url_access (id int NOT NULL AUTO_INCREMENT primary key, timestamp varchar(256), url varchar(256), pv bigint);
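If the Spark job runs on a different host from MySQL (as in the script below, which connects to 192.168.1.1 as root), the account must be allowed to connect remotely. A hypothetical grant in MySQL 5.x syntax, with the user, host pattern, and password adjusted to your setup:

> grant insert, select on nginx.* to 'root'@'192.168.1.%' identified by 'test';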
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/01/09 23:30
# @Author : xuanda
# @File : kafka_to_sparkstreaming_to_mysql.py
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import MySQLdb, time
def save(rdd):
    # Runs on the driver once per micro-batch; collect() is acceptable
    # here because each batch reduces to a small set of (url, pv) pairs.
    rows = rdd.collect()
    if not rows:
        return
    conn = MySQLdb.connect(host="192.168.1.1", port=3306, user="root", passwd="test", charset='utf8')
    cur = conn.cursor()
    timestamp = int(round(time.time()))
    for url, pv in rows:
        try:
            # parameterized query avoids quoting/injection problems
            sql = "insert into nginx.url_access (timestamp, url, pv) values (%s, %s, %s)"
            cur.execute(sql, (timestamp, url, pv))
            conn.commit()
        except BaseException as err:
            print('Exception: ', err)
            conn.rollback()
    cur.close()
    conn.close()
if __name__ == '__main__':
    zkQuorum = '192.168.1.20:2181'
    topic = {'nginx-access-log': 1}   # topic name -> number of receiver threads
    groupid = "kafka-to-sparkstreaming"
    appName = "KafkaToSparkstreaming"
    timecell = 1   # batch interval, in seconds

    # init spark streaming
    sc = SparkContext(master="spark://192.168.1.20:7077", appName=appName)
    ssc = StreamingContext(sc, timecell)

    # Create an input stream that pulls messages from a Kafka broker.
    # Each element is a (key, message) tuple, so x[1] is the raw log line.
    lines = KafkaUtils.createStream(ssc, zkQuorum, groupid, topic)

    # Split each line on spaces, take field 8 (the request URL in this
    # log_format; adjust the index for your own nginx configuration),
    # then count per-URL occurrences within the batch.
    url_add_reduce = lines.map(lambda x: x[1]) \
                          .map(lambda line: line.split(" ")) \
                          .map(lambda line: line[8]) \
                          .map(lambda w: (str(w), 1)) \
                          .reduceByKey(lambda x, y: x + y)
    url_add_reduce.foreachRDD(save)

    ssc.start()
    ssc.awaitTermination()
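A quick way to sanity-check the field index outside Spark is to run the same split on a captured log line. The sample line below is made up for illustration; with the default combined log_format, index 8 would be the status code rather than the URL, so verify the index against your own access log:

# standalone check of the split/index logic, no Spark required
sample = '192.168.1.100 - - [09/Jan/2019:23:30:00 +0800] "GET /index.html HTTP/1.1" 200 612'
for i, field in enumerate(sample.split(" ")):
    print(i, field)   # find which index holds the URL in your log_format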
Launch the job
spark-2.4.0-bin-hadoop2.7/bin/spark-submit --jars /tmp/spark-streaming-kafka-0-8-assembly_2.11-2.4.0.jar kafka_to_sparkstreaming_to_mysql.py
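Note: the receiver-based KafkaUtils.createStream used here comes from the spark-streaming-kafka-0-8 connector, which was deprecated in Spark 2.3 and removed in Spark 3.0; the assembly jar must match the Scala and Spark versions in use (2.11 and 2.4.0 above).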
Check the results in MySQL

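A quick query confirms rows are arriving, for example:

> select timestamp, url, pv from nginx.url_access order by id desc limit 10;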
Tomorrow I'll put together a simple page to display the statistics; a rough sketch of the idea follows.
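Purely as a sketch of that future page (the use of Flask, the route, and the markup here are my assumptions, not part of the original pipeline), something like this could render the latest rows:

# pv_page.py -- hypothetical minimal Flask page over the url_access table
from flask import Flask
import MySQLdb

app = Flask(__name__)

@app.route('/')
def pv_stats():
    conn = MySQLdb.connect(host="192.168.1.1", port=3306, user="root",
                           passwd="test", charset='utf8')
    cur = conn.cursor()
    # latest 20 rows written by the streaming job
    cur.execute("select timestamp, url, pv from nginx.url_access "
                "order by id desc limit 20")
    rows = cur.fetchall()
    cur.close()
    conn.close()
    body = "".join("<tr><td>%s</td><td>%s</td><td>%s</td></tr>" % r for r in rows)
    return "<table border='1'><tr><th>timestamp</th><th>url</th><th>pv</th></tr>" + body + "</table>"

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)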