MySQL: create the database and table
> create database nginx;
> use nginx;
> create table url_access (
      id int NOT NULL AUTO_INCREMENT primary key,
      timestamp varchar(256),
      url varchar(256),
      pv bigint
  );

Note: pv holds a page-view count, so it needs an integer type. In MySQL, LONG is actually a synonym for MEDIUMTEXT rather than a 64-bit integer, which is why bigint is used here.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/01/09 23:30
# @Author : xuanda
# @File : kafka_to_sparkstreaming_to_mysql.py
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import MySQLdb
import time
def save(rdd):
    # Runs on the driver: collect() pulls the batch's (url, pv) pairs back,
    # so this is only suitable for small per-batch result sets.
    rows = rdd.collect()
    if not rows:
        return
    conn = MySQLdb.connect(host="192.168.1.1", port=3306, user="root",
                           passwd="test", charset='utf8')
    cur = conn.cursor()
    timestamp = int(time.time())
    for url, pv in rows:
        try:
            # Parameterized query; interpolating raw log fields into the SQL
            # string would be an injection risk.
            cur.execute("insert into nginx.url_access (timestamp, url, pv) "
                        "values (%s, %s, %s)", (timestamp, url, pv))
            conn.commit()
        except MySQLdb.Error as err:
            print('Exception: ', err)
            conn.rollback()
    cur.close()
    conn.close()
if __name__ == '__main__':
    zkQuorum = '192.168.1.20:2181'
    topic = {'nginx-access-log': 1}   # topic name -> number of consumer threads
    groupid = "kafka-to-sparkstreaming"
    appName = "KafkaToSparkstreaming"
    timecell = 1                      # batch interval in seconds

    # Init Spark Streaming
    sc = SparkContext(master="spark://192.168.1.20:7077", appName=appName)
    ssc = StreamingContext(sc, timecell)

    # Create an input stream that pulls messages from a Kafka broker via ZooKeeper
    lines = KafkaUtils.createStream(ssc, zkQuorum, groupid, topic)

    # Each Kafka record is a (key, message) pair: take the message, split it on
    # spaces, pick out the URL field (index 8 here -- this depends on the nginx
    # log format), and count page views per URL within each batch.
    url_add_reduce = (lines.map(lambda x: x[1])
                           .map(lambda line: line.split(" "))
                           .map(lambda line: line[8])
                           .map(lambda w: (str(w), 1))
                           .reduceByKey(lambda x, y: x + y))
    url_add_reduce.foreachRDD(save)

    ssc.start()
    ssc.awaitTermination()
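One thing worth calling out: which field split(" ") index 8 lands on depends entirely on the log_format configured in nginx. As a quick illustration (the sample line below is made up), with nginx's default "combined" format the request path sits at index 6 and the status code at index 8:

# Made-up sample line in nginx's default "combined" log format.
sample = ('192.168.1.100 - - [09/Jan/2019:23:30:00 +0800] '
          '"GET /index.html HTTP/1.1" 200 612 "-" "curl/7.29.0"')

fields = sample.split(" ")
print(fields[6])  # /index.html  <- request path in the combined format
print(fields[8])  # 200          <- status code in the combined format

So double-check the index against your own access log before trusting the counts.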
Launch the job
spark-2.4.0-bin-hadoop2.7/bin/spark-submit --jars /tmp/spark-streaming-kafka-0-8-assembly_2.11-2.4.0.jar kafka_to_sparkstreaming_to_mysql.py
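Once the job is up, you can push a couple of test lines into the topic by hand to confirm the whole path works. A minimal sketch, assuming the kafka-python package is installed and a broker is listening on 192.168.1.20:9092 (both assumptions, not part of the setup above):

# Minimal test producer -- kafka-python package and the broker address
# 192.168.1.20:9092 are assumptions, not part of the original setup.
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers='192.168.1.20:9092')
line = ('192.168.1.100 - - [09/Jan/2019:23:30:00 +0800] '
        '"GET /index.html HTTP/1.1" 200 612 "-" "curl/7.29.0"')
producer.send('nginx-access-log', line.encode('utf-8'))
producer.flush()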
Check the results in MySQL
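A quick way to eyeball what the job wrote, reusing the same connection settings as the streaming script:

import MySQLdb

conn = MySQLdb.connect(host="192.168.1.1", port=3306, user="root",
                       passwd="test", db="nginx", charset="utf8")
cur = conn.cursor()
# Most recent rows first; each batch appends one row per URL seen.
cur.execute("select timestamp, url, pv from url_access order by id desc limit 10")
for row in cur.fetchall():
    print(row)
cur.close()
conn.close()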
Tomorrow I'll put together a simple page to visualize these stats.