Python + MySQL Setup

This article shows how to install the MySQL server, client, and development library on Linux with sudo, and how to add Python support through the MySQL-python package. It also explains how to configure MySQL to accept remote connections: editing the configuration file, granting access to a remote user, and flushing privileges.
sudo apt-get install mysql-server
sudo apt-get install mysql-client
sudo apt-get install libmysqlclient-dev
Download link: https://pypi.python.org/pypi/MySQL-python/
Download MySQL-python-1.2.5.zip, unpack it, and change into the MySQL-python-1.2.5 directory:

python setup.py install
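Once the package is installed, you can check that it imports and connects. The following is a minimal sketch, assuming a local MySQL server and the root account (the password is a placeholder; use your own):

import MySQLdb

# Connect to the local MySQL server (replace the password with your own).
conn = MySQLdb.connect(host="127.0.0.1", user="root", passwd="your_password")
cursor = conn.cursor()
cursor.execute("SELECT VERSION()")
print(cursor.fetchone())   # e.g. ('5.7.x-...',)
cursor.close()
conn.close()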

Configuring MySQL for remote connections:


1. In /etc/mysql, open my.cnf with vim and locate this line:

bind-address = 127.0.0.1

Comment it out (or change 127.0.0.1 to 0.0.0.0) so MySQL listens on all interfaces, then restart the service (sudo service mysql restart).

2. Log in to MySQL as root:

Run: mysql -u root -p

When prompted, enter the MySQL root password.

3. At the mysql> prompt, run:

grant all on *.* to username@'%' identified by 'password';

Note: replace username with the account name and password with a password of your choice, for example:

grant all on *.* to zl@'%' identified by '123456';

4. Run flush privileges; so the new grant takes effect.


OK
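To confirm the remote grant works, connect from another machine with the account created above. A minimal sketch, assuming the MySQL server is reachable at 192.168.1.100 (a hypothetical address) and using the example account zl/123456:

import MySQLdb

# Connect over the network instead of localhost (192.168.1.100 is a placeholder IP).
conn = MySQLdb.connect(host="192.168.1.100", port=3306, user="zl", passwd="123456")
cursor = conn.cursor()
cursor.execute("SELECT CURRENT_USER()")
print(cursor.fetchone())   # should report zl@<client-host>
cursor.close()
conn.close()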

