- 博客(34)
- 收藏
- 关注
原创 评分卡模型(二数据清洗)
# -*- coding: utf-8 -*-"""Created on Sun Sep 16 19:04:53 2018@author: wangxihe"""import osimport pandas as pdimport numbersimport numpy as npimport matplotlib.pyplot as plt#%%os.chdir(r'E...
2018-09-20 09:09:21
782
原创 评分卡模型-(一特征构建)
# -*- coding: utf-8 -*-"""Created on Sun Sep 16 09:24:18 2018@author: wangxihe"""import osimport pandas as pdimport datetimeimport matplotlib.pyplot as pltimport collectionsimport numpy as...
2018-09-20 09:02:01
791
原创 Python实现批量转word和excel工具
# -*- coding: utf-8 -*-"""Created on Thu Aug 9 14:21:16 2018@author: wangxihe"""try: import os os.chdir(r'D:\ExpPdf') from win32com.client import Dispatch,constants,gencacheexcept Exc...
2018-08-10 13:49:59
3129
原创 机器学习二(二分类问题)
# -*- coding: utf-8 -*-"""Created on Mon Aug 6 20:37:19 2018@author: wangxihe"""import pandas as pdimport numpy as npimport matplotlib.pyplot as pltfrom scipy import stats from statsmodels...
2018-08-07 14:41:13
3831
1
原创 机器学习一(回归预测)
# -*- coding: utf-8 -*-"""Created on Mon Aug 6 08:48:58 2018@author: wangxihe"""#%%首先我们使用传统的统计学回归方法,然后再使用多种机器学习#数据说明#CRIM:城镇人均犯罪率。#ZN:住宅用地超过 25000 sq.ft. 的比例。#INDUS:城镇非零售商用土地的比例。#CHAS:查理斯河...
2018-08-07 08:49:02
1342
原创 检验(两样本T检验、相关分析、方差分析)(数据挖掘笔记一)
# -*- coding: utf-8 -*-"""Created on Sat Jul 28 13:40:57 2018@author: wangxihe"""#%%import pandas as pdimport statsmodels.api as smimport osimport numpy as npimport matplotlib.pyplot as plt...
2018-07-29 15:13:10
11810
原创 机器学习(1)KNN算法手写体识别
from numpy import *import operatorfrom os import listdir#从列方向扩展#tile(a,(size,1))def knn(k,testdata,traindata,labels): traindatasize=traindata.shape[0] dif=tile(testdata,(traindatasize,1))...
2018-06-02 20:14:11
424
原创 计算文本相似度 jieba, wordcloud, gensim
'''1、读取文档2、对要计算的多篇文档进行分词3、对文档进行整理成指定格式,方便后续进行计算4、计算出词语的频率5【可选】、对频率低的词语进行过滤6、通过语料库建立词典7、加载要对比的文档8、将要对比的文档通过doc2bow转化为稀疏向量9、对稀疏向量进行进一步处理,得到新语料库10、将新语料库通过tfidfmodel进行处理,得到tfidf11、通过token2id得到特...
2018-06-02 12:23:14
806
原创 jieba分词,分词,解析词性。
import jieba#全模式sentence="教育系统耕耘,在重庆大学从学生成长为校长,2004年7月被明确为副部长级2010年调任武汉大学校长,任教育部副部长、党组成员"words1=jieba.cut(sentence,cut_all=True)print(words1)for word in words1: print(word)print("===========...
2018-06-01 16:12:47
2100
原创 糗事百科多线程介绍
import urllib.requestimport urllib.errorimport reimport threadingheaders = ("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537....
2018-05-31 19:32:58
285
原创 urllib多线程介绍代码实现
import threading # 导入多线程包class A(threading.Thread): # 创建一个多线程A def __init__(self): # 必须包含的两个方法之一:初始化线程 threading.Thread.__init__(self) def run(self): # 必须包含的两个方法之一:线程运行方法 ...
2018-05-31 19:25:44
478
原创 普通爬虫(糗事百科)
import urllib.requestimport urllib.errorimport reheaders = ("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36")opener = url...
2018-05-31 19:23:49
191
原创 微信爬虫,爬取网页信息(使用代理和模拟浏览器)
#http://weixin.sogou.com/import reimport urllib.requestimport timeimport urllib.errorimport urllib.requestimport scipy#自定义函数,功能为使用代理服务器爬一个网址def use_proxy(proxy_addr,url): #建立异常处理机制 tr...
2018-05-31 18:45:00
5780
原创 自动登录豆瓣(不出现验证码情况)
import urllib.requestimport xlsxwriterimport re#模拟post请求import urllib.parse, urllib.request, http.cookiejar, recookie = http.cookiejar.CookieJar()cookieProc = urllib.request.HTTPCookieProcesso...
2018-05-30 22:43:51
1059
原创 Python xlrd,xlsxwriter操作 excel
import xlrd,xlwt#打开excel文件并获取所有sheetworkbook = xlrd.open_workbook(r'D:\1.xlsx')sheetlist=workbook.sheet_names()# print (sheetlist)coldata=[]for ele in sheetlist: sheet=workbook.sheet_by_name...
2018-05-30 13:25:17
1479
1
原创 python正则表达式
import re## ^ 匹配开始# $ 匹配行尾# . 匹配除换行符以外的任何单个字符# [......] 匹配括号内任何单个字符# [^......] 匹配单个字符或多个字符不在括号内# * 匹配0个或多个匹配前面的表达式# + 匹配1个或多个前面出现的表达式# ? 匹配0次或1次前面出现的表达式# {n} 精确匹配前面出现的表达式的数量# {n,m} 匹配至...
2018-05-30 10:31:56
144
原创 python英文分词及字典排序
speak='''Chief Justice Roberts, President Carter, President Clinton, President Bush, President Obama, fellow Americans and people of the world, thank you.We, the citizens of America, are now joined in...
2018-05-28 14:40:31
2029
原创 聚类分析
import pandas as pdaimport numpy as npimport missingnoimport matplotlib.pyplot as pltimport seaborn as sns#读入数据data=pda.read_csv("114_congress.csv")#显示前几行print(data.head())#查看缺失值missingno....
2018-05-02 16:14:39
377
原创 用户流失预测(KNN SVC RF)
import pandas as pdaimport numpy as npimport missingnoimport matplotlib.pyplot as pltuserData=pda.read_csv("churn.csv")print(userData.shape)# print(userData.describe())# print(userData.colum...
2018-05-02 16:04:20
929
原创 信用卡异常检查(过采样,下采样、逻辑回归,混淆矩阵)
import pandas as pdaimport numpy as npimport matplotlib.pyplot as pltimport itertoolsimport missingnodata=pda.read_csv("creditcard.csv")# print(data.head())count_class=pda.value_counts(data.C...
2018-05-02 15:52:46
1334
原创 PCA 降维
import numpy as npimport pandas as pdafrom sklearn.datasets import load_irisimport matplotlib.pyplot as plt#加载数据iris=load_iris()# print(iris)data=iris["data"]labels=iris["target"]# print(da...
2018-05-02 09:55:19
267
原创 python 回归拟合图形展示
import numpy as npimport pandas as pdaimport matplotlib.pyplot as pltimport matplotlib as mplimport seaborn as snssns.set()#color_codes=Truenp.random.seed(sum(map(ord,"regression")))tips=sns.l...
2018-04-29 17:30:40
2215
原创 Python多变量图形展示
from __future__ import absolute_import,division,print_functionimport matplotlib as mplimport matplotlib.pyplot as pltfrom matplotlib.pyplot import GridSpecimport seaborn as snsimport numpy as np...
2018-04-29 17:20:23
1439
原创 pandas_profiling 数据报表展示
from __future__ import absolute_import,division,print_functionimport matplotlib as mplimport matplotlib.pyplot as pltfrom matplotlib.pyplot import GridSpecimport seaborn as snsimport numpy as n...
2018-04-29 10:37:11
1136
原创 Python Seaborn画图库代码整理
import seaborn as snsimport matplotlib.pyplot as pltimport numpy as np#构造数据def sinplot(flip=1): x=np.linspace(0,14,100) print(x) for i in range(1,7): plt.plot(x,np.sin(x+i*0.5...
2018-04-29 09:33:23
782
原创 Python可视化分析球员裁判数据(二、单变量分析,缺失值可视化)
from __future__ import absolute_import,division,print_functionimport matplotlib as mplimport matplotlib.pyplot as pltfrom matplotlib.pyplot import GridSpecimport seaborn as snsimport numpy as n...
2018-04-28 20:20:13
888
原创 Python可视化分析球员裁判数据(一)
from __future__ import absolute_import,division,print_functionimport matplotlib as mplimport matplotlib.pyplot as pltfrom matplotlib.pyplot import GridSpecimport seaborn as snsimport numpy as n...
2018-04-28 20:17:15
986
原创 自定义Python缺失值检查函数
import pandas as pdaimport numpy as np#数据预处理data=pda.read_csv("titanic_train.csv")print(data.columns)#自定义缺失值检测函数def not_null_num(column): column_null=pda.isnull(column) column_notnull...
2018-04-28 14:37:35
1315
原创 python数据预处理 缺失值,指标统计
import pandas as pdaimport numpy as np#数据预处理data=pda.read_csv("titanic_train.csv")print(data.columns)#缺失值# print(data[pda.isnull(data["Age"])])# #非缺失值# print(data[pda.notnull(data["Age"])])...
2018-04-28 13:37:09
7547
原创 Python取行和列数据及切片操作
import pandas as pdadata=pda.read_csv("food_info.csv")print(type(data))#<class 'pandas.core.frame.DataFrame'>print(data.describe())#描述信息print(data.dtypes)#各字段信息print(data.head())# 取前5条数据...
2018-04-28 11:03:06
13431
1
原创 Python数组排序
import numpy as np#创建一个二维数组data=np.sin(np.arange(20)).reshape(5,4)print("data:")print(data)# [[ 0. 0.84147098 0.90929743 0.14112001]# [-0.7568025 -0.95892427 -0.2794155 0.656986...
2018-04-28 10:20:17
1105
原创 Python中的赋值、浅拷贝、深拷贝之间的区别
1.赋值: 只是复制了原对象的引用,不会开辟新的内存空间。2.浅拷贝: 创建新对象,其内容是原对象的引用,view3.深拷贝:只有一种形式,copy模块中的deepcopy函数。 和浅拷贝对应,深拷贝拷贝了对象的所有元素,包括多层嵌套的元素。 深拷贝出来的对象是一个全新的对象,不再与原来的对象有任何关联。import numpy as npa=np.array([12,10,11,...
2018-04-28 09:19:40
103
原创 Python Numpy学习总结
numpy 是一个 Python 包。 它代表 “Numeric Python”。 它是一个由多维数组对象和用于处理数组的例程集合组成的库import numpy as np#定义一个一维数组a1=np.array([1,42,3,4,5,6])print("结果:",a1)#定义一个二维数组a2=np.array([ [1,2,3], ["a","b","c"], ["1a...
2018-04-26 15:35:07
402
空空如也
空空如也
TA创建的收藏夹 TA关注的收藏夹
TA关注的人