Step 1: Collect user and event information
# Inspect the data in train.csv
import pandas as pd
df_train = pd.read_csv('train.csv')
df_train.head()
 | user | event | invited | timestamp | interested | not_interested
---|---|---|---|---|---|---
0 | 3044012 | 1918771225 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
1 | 3044012 | 1502284248 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
2 | 3044012 | 2529072432 | 0 | 2012-10-02 15:53:05.754000+00:00 | 1 | 0 |
3 | 3044012 | 3072478280 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
4 | 3044012 | 1390707377 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15398 entries, 0 to 15397
Data columns (total 6 columns):
user 15398 non-null int64
event 15398 non-null int64
invited 15398 non-null int64
timestamp 15398 non-null object
interested 15398 non-null int64
not_interested 15398 non-null int64
dtypes: int64(5), object(1)
memory usage: 721.9+ KB
# Inspect the data in test.csv
df_test = pd.read_csv('test.csv')
df_test.head()
 | user | event | invited | timestamp
---|---|---|---|---
0 | 1776192 | 2877501688 | 0 | 2012-11-30 11:39:01.230000+00:00 |
1 | 1776192 | 3025444328 | 0 | 2012-11-30 11:39:01.230000+00:00 |
2 | 1776192 | 4078218285 | 0 | 2012-11-30 11:39:01.230000+00:00 |
3 | 1776192 | 1024025121 | 0 | 2012-11-30 11:39:01.230000+00:00 |
4 | 1776192 | 2972428928 | 0 | 2012-11-30 11:39:21.985000+00:00 |
df_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10237 entries, 0 to 10236
Data columns (total 4 columns):
user 10237 non-null int64
event 10237 non-null int64
invited 10237 non-null int64
timestamp 10237 non-null object
dtypes: int64(3), object(1)
memory usage: 320.0+ KB
- The first two columns are the user ID and the corresponding event ID
- test.csv, however, lacks the labels (interested or not_interested)
#Full code for step 1
from collections import defaultdict
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
#Analyzes the associations between users and events in train and test.
class ProgramEntities:
"""
我们只关心train和test中出现的user和event,因此重点处理这部分关联数据,
经过统计:train和test中总共3391个users和13418个events
"""
def __init__(self):
#统计训练集中有多少独立的用户的events
uniqueUsers = set()#uniqueUsers保存总共多少个用户:3391个
uniqueEvents = set()#uniqueEvents保存总共多少个events:13418个
eventsForUser = defaultdict(set)#字典eventsForUser保存了每个user:所对应的event
usersForEvent = defaultdict(set)#字典usersForEvent保存了每个event:哪些user点击
for filename in ['train.csv', 'test.csv']:
f = open(filename)
f.readline()#skip the header line
for line in f:
cols = line.strip().split(',')
uniqueUsers.add( cols[0] ) #collect all users
uniqueEvents.add( cols[1] ) #collect all events
eventsForUser[cols[0]].add( cols[1] ) #keyed by user, record the events each user touched
usersForEvent[cols[1]].add( cols[0] ) #keyed by event, record the users each event touched
f.close()
self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
self.userIndex = dict()
self.eventIndex = dict()
for i, u in enumerate(uniqueUsers):
self.userIndex[u] = i
for i, e in enumerate(uniqueEvents):
self.eventIndex[e] = i
ftrain = open('train.csv')
ftrain.readline()
for line in ftrain:
cols = line.strip().split(',')
i = self.userIndex[ cols[0] ]
j = self.eventIndex[ cols[1] ]
self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
ftrain.close()
sio.mmwrite('PE_userEventScores', self.userEventScores)
#to avoid unnecessary computation, we collect only the associated user pairs and event pairs
#an associated user pair is two users who acted on at least one common event
#an associated event pair is two events that at least one common user acted on
self.uniqueUserPairs = set()
self.uniqueEventPairs = set()
#find associated user pairs
for event in uniqueEvents:
users = usersForEvent[event]
if len(users) > 2:
self.uniqueUserPairs.update( itertools.combinations(users, 2) )
#find associated event pairs
for user in uniqueUsers:
events = eventsForUser[user]
if len(events) > 2:
self.uniqueEventPairs.update( itertools.combinations(events, 2) )
#print(self.userIndex)
cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
print('Step 1: collecting user and event information...')
pe = ProgramEntities()
print('Step 1 complete...\n')
Step 1: collecting user and event information...
Step 1 complete...
pe.userEventScores
<3391x13418 sparse matrix of type '<class 'numpy.float64'>'
with 4645 stored elements in Dictionary Of Keys format>
Notes:
- PE_userEventScores.mtx is the matrix over all users and events, but its stored values come only from train.csv and are 1 or -1
- scipy.sparse.dok_matrix() creates a sparse matrix, so PE_userEventScores.mtx only stores the non-zero values
- A quick reference for the variables used in this step:
- uniqueUsers: set of all user IDs in train.csv and test.csv
- uniqueEvents: set of all event IDs in train.csv and test.csv
- eventsForUser: dict whose keys are users and whose values are each user's set of events
- usersForEvent: dict whose keys are events and whose values are each event's set of users
- userIndex: dict assigning each user an index
- eventIndex: dict assigning each event an index
- userEventScores: 3391 * 13418 sparse matrix, user vs event, whose entries are each user's interest score for an event in train.csv (1, 0 or -1), i.e. interested - not_interested
import pandas as pd
pd.DataFrame(pe.userEventScores.todense())
userEventScores: each user's interest score (1, 0 or -1) for each event
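To sanity-check the saved artifacts, here is a minimal sketch (assuming the PE_* files written above are in the working directory) that reloads them and looks up one pair from train.csv:
#reload the step-1 artifacts
import _pickle as cPickle
import scipy.io as sio
userEventScores = sio.mmread('PE_userEventScores').todok()#back to a sparse matrix
userIndex = cPickle.load( open('PE_userIndex.pkl', 'rb') )
eventIndex = cPickle.load( open('PE_eventIndex.pkl', 'rb') )
#user 3044012 marked interested in event 2529072432 in train.csv, so the stored score should be 1.0
i, j = userIndex['3044012'], eventIndex['2529072432']
print(userEventScores[i, j])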
import pandas as pd
df_train = pd.read_csv('train.csv')
df_train[df_train['event']==1502284248]
import itertools
for each in itertools.combinations(set([3044012,1302145719,3194014105,3669515588]), 2):
print(each)
(3194014105, 3669515588)
(3194014105, 3044012)
(3194014105, 1302145719)
(3669515588, 3044012)
(3669515588, 1302145719)
(3044012, 1302145719)
uniqueUserPairs: a set. If an event is associated with three or more users, all of that event's users are paired up pairwise and stored in uniqueUserPairs. Note that the raw user IDs are stored, not the users' indices:
import pandas as pd
df_train = pd.read_csv('train.csv')
df_train[df_train['user']==3044012]
 | user | event | invited | timestamp | interested | not_interested
---|---|---|---|---|---|---
0 | 3044012 | 1918771225 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
1 | 3044012 | 1502284248 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
2 | 3044012 | 2529072432 | 0 | 2012-10-02 15:53:05.754000+00:00 | 1 | 0 |
3 | 3044012 | 3072478280 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
4 | 3044012 | 1390707377 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
5 | 3044012 | 1532377761 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
import itertools
for each in itertools.combinations(set([1918771225,1502284248,2529072432, 3072478280, 1390707377, 1532377761 ]), 2):
print(each)
(1532377761, 3072478280)
(1532377761, 2529072432)
(1532377761, 1390707377)
(1532377761, 1502284248)
(1532377761, 1918771225)
(3072478280, 2529072432)
(3072478280, 1390707377)
(3072478280, 1502284248)
(3072478280, 1918771225)
(2529072432, 1390707377)
(2529072432, 1502284248)
(2529072432, 1918771225)
(1390707377, 1502284248)
(1390707377, 1918771225)
(1502284248, 1918771225)
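uniqueEventPairs: a set; likewise, if the same user is associated with three or more events, those events are paired up pairwise and stored in uniqueEventPairs. As with uniqueUserPairs, raw event IDs are stored, not matrix indices; the combinations above are exactly the pairs generated for user 3044012.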
Step 2: Compute user similarity
Since this step uses users.csv, let's first look at its contents (first 10 rows):
import pandas as pd
df_users = pd.read_csv('users.csv')
df_users.head(10)
 | user_id | locale | birthyear | gender | joinedAt | location | timezone
---|---|---|---|---|---|---|---
0 | 3197468391 | id_ID | 1993 | male | 2012-10-02T06:40:55.524Z | Medan Indonesia | 480.0 |
1 | 3537982273 | id_ID | 1992 | male | 2012-09-29T18:03:12.111Z | Medan Indonesia | 420.0 |
2 | 823183725 | en_US | 1975 | male | 2012-10-06T03:14:07.149Z | Stratford Ontario | -240.0 |
3 | 1872223848 | en_US | 1991 | female | 2012-11-04T08:59:43.783Z | Tehran Iran | 210.0 |
4 | 3429017717 | id_ID | 1995 | female | 2012-09-10T16:06:53.132Z | NaN | 420.0 |
5 | 627175141 | ka_GE | 1973 | female | 2012-11-01T09:59:17.590Z | Tbilisi Georgia | 240.0 |
6 | 2752000443 | id_ID | 1994 | male | 2012-10-03T05:22:17.637Z | Medan Indonesia | 420.0 |
7 | 3473687777 | id_ID | 1965 | female | 2012-10-03T12:19:29.975Z | Medan Indonesia | 420.0 |
8 | 2966052962 | id_ID | 1979 | male | 2012-10-31T10:11:57.668Z | Medan Indonesia | 420.0 |
9 | 264876277 | id_ID | 1988 | female | 2012-10-02T07:28:09.555Z | Medan Indonesia | 420.0 |
#use the locale and pycountry modules to convert these strings to numeric values
import locale
locale.locale_alias
{'a3': 'az_AZ.KOI8-C',
'a3_az': 'az_AZ.KOI8-C',
'a3_az.koic': 'az_AZ.KOI8-C',
'aa_dj': 'aa_DJ.ISO8859-1',
'aa_er': 'aa_ER.UTF-8',
...
'zh_tw': 'zh_TW.big5',
'zh_tw.euc': 'zh_TW.eucTW',
'zh_tw.euctw': 'zh_TW.eucTW',
'zu': 'zu_ZA.ISO8859-1',
'zu_za': 'zu_ZA.ISO8859-1'}
(output truncated: locale.locale_alias maps several hundred alias strings to canonical locale names)
1. Processing the locale column
import locale
from collections import defaultdict
localeIdMap = defaultdict(int)
for i, l in enumerate(locale.locale_alias.keys()):
localeIdMap[l] = i + 1
for each in localeIdMap:
print(each, '\t', localeIdMap[each])
ee 	 1
fr_ch 	 2
fo_fo 	 3
af_za 	 4
bn_in 	 5
...
te_in 	 549
sl 	 550
lo_la.mulelao1 	 551
(output truncated: one line per locale alias, 551 entries in total)
So passing a locale string to localeIdMap converts it to a numeric value. If the string is not among localeIdMap's keys, 0 is returned, which is exactly what defaultdict(int) is for:
print(localeIdMap['en_GB'.lower()])
print(localeIdMap['en_US'.lower()])
print(localeIdMap['id_ID'.lower()])
print(localeIdMap['ka_GE'.lower()])
366
404
110
385
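For a string that is not among the keys, the defaultdict falls back to 0 ('xx_XX' here is just a made-up locale for illustration):
print(localeIdMap['xx_XX'.lower()])#prints 0, since 'xx_xx' is not a known alias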
2. Processing the birthyear column
This one is simple: if a value exists, convert it to an int; otherwise fill with 0.
def getBirthYearInt(birthYear):
try:
return 0 if birthYear=="None" else int(birthYear)
except:
return 0
print(getBirthYearInt(1992))
print(getBirthYearInt(None))
1992
0
3. Processing the gender column
male maps to 1, female maps to 2, and missing values are filled with 0.
from collections import defaultdict
genderIdMap = defaultdict(int, {'male':1, 'female':2})
print(genderIdMap['male'])
print(genderIdMap['female'])
print(genderIdMap[None])
1
2
0
4. Processing the joinedAt column
We notice that this column's values share a common pattern:
import pandas as pd
df_users = pd.read_csv('users.csv')
df_users['joinedAt'][:10]
0 2012-10-02T06:40:55.524Z
1 2012-09-29T18:03:12.111Z
2 2012-10-06T03:14:07.149Z
3 2012-11-04T08:59:43.783Z
4 2012-09-10T16:06:53.132Z
5 2012-11-01T09:59:17.590Z
6 2012-10-03T05:22:17.637Z
7 2012-10-03T12:19:29.975Z
8 2012-10-31T10:11:57.668Z
9 2012-10-02T07:28:09.555Z
Name: joinedAt, dtype: object
The column is either None or a timestamp string like the above, always with a T in the middle and a Z at the end. Based on this pattern we can extract the time information with the datetime module:
import datetime
def getJoinedYearMonth(dateString):
try:
dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
return "".join( [str(dttm.year), str(dttm.month)] )
except:
return 0
df_users['joinedAt'].map(getJoinedYearMonth)[:10]
0 201210
1 20129
2 201210
3 201211
4 20129
5 201211
6 201210
7 201210
8 201210
9 201210
Name: joinedAt, dtype: object
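Note that str(dttm.month) is not zero-padded, which is why September 2012 comes out as '20129' rather than '201209'. A hedged alternative sketch (not the original code) that keeps the value fixed-width by using strftime:
import datetime

def getJoinedYearMonthPadded(dateString):
    try:
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return dttm.strftime('%Y%m')  #e.g. '201209' instead of '20129'
    except (TypeError, ValueError):
        return 0

print(getJoinedYearMonthPadded('2012-09-29T18:03:12.111Z'))#201209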
5. Processing the location column
Let's look at the location column in users.csv (first 20 rows):
df_users['location'][:20]
0 Medan Indonesia
1 Medan Indonesia
2 Stratford Ontario
3 Tehran Iran
4 NaN
5 Tbilisi Georgia
6 Medan Indonesia
7 Medan Indonesia
8 Medan Indonesia
9 Medan Indonesia
10 Medan Indonesia
11 Phnom Penh
12 Djokja Yogyakarta Indonesia
13 Triolet Mauritius
14 NaN
15 NaN
16 NaN
17 Surabaya Indonesia
18 Medan Indonesia
19 NaN
Name: location, dtype: object
We use the pycountry module to convert this column to numeric values; pycountry.countries is an iterable:
import pycountry
from collections import defaultdict
countryIdMap = defaultdict(int)
for i, c in enumerate(pycountry.countries):
countryIdMap[c.name.lower()] = i + 1
#convert the location string to a numeric value
def getCountryId(location):
if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:
#take the substring after the last space as the country name;
#'+ 1' starts right after the last space (with '+ 2' the country's first letter would be skipped and every lookup would miss)
return countryIdMap[ location[location.rindex(' ') + 1: ].lower() ]
else:
return 0
print(getCountryId('San Dimas California'))
print(getCountryId('Jogjakarta Indonesia'))
0
103
Many machine learning models only accept numeric input, so the location strings must be converted to numbers. A common approach is one-hot encoding, but that would produce a very sparse matrix; instead we use the pycountry library to encode each location according to the country (and subdivision) entries stored in pycountry, and use that code in place of the raw location string.
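A quick peek at what pycountry provides (a sketch; the exact counts and IDs depend on your installed pycountry version, so treat the numbers as illustrative):
import pycountry

print(len(list(pycountry.countries)))              #about 250 ISO 3166-1 entries
print(pycountry.countries.get(alpha_2='ID').name)  #'Indonesia'
print(countryIdMap['indonesia'])                   #103 in the run above
print(countryIdMap['california'])                  #0 here -- US/CA subdivisions are only added inside DataCleaner below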
6. Processing the timezone column
Simple again: convert existing values to int, fill missing values with 0.
def getTimezoneInt(timezone):
try:
return int(timezone)
except:
return 0
print(getTimezoneInt(-240))#-240
print(getTimezoneInt(240))
print(getTimezoneInt(None))
-240
240
0
7. Normalize the columns processed in 1-6 above
Building self.userMatrix ends with sklearn.preprocessing.normalize(); normalization makes it convenient to compute the similarity between two users afterwards.
Similarity is only computed for the uniqueUserPairs found in step 1 of this walkthrough, i.e. user pairs linked by having acted on the same event.
The computation uses scipy.spatial.distance.correlation(u, v), which returns the correlation distance between u and v, i.e. 1 minus the Pearson correlation coefficient (centered cosine); see the sketch below.
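A toy sketch of these two building blocks (made-up vectors, not the real user rows): l1 normalization makes every column sum to 1, and ssd.correlation returns a distance, so perfectly correlated vectors give 0:
import numpy as np
import scipy.spatial.distance as ssd
from sklearn.preprocessing import normalize

X = np.array([[1.0, 200.0],
              [3.0, 600.0]])
print(normalize(X, norm='l1', axis=0))#[[0.25 0.25], [0.75 0.75]] -- each column sums to 1

u = np.array([1.0, 2.0, 3.0])
v = np.array([2.0, 4.0, 6.0])
print(ssd.correlation(u, v))#~0.0 -- perfectly correlated vectors have correlation distance 0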
#Full code for step 2
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
import scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalize
#class building the user-event matrix
class ProgramEntities:
"""
我们只关心train和test中出现的user和event,因此重点处理这部分关联数据,
经过统计:train和test中总共3391个users和13418个events
"""
def __init__(self):
#统计训练集中有多少独立的用户的events
uniqueUsers = set()#uniqueUsers保存总共多少个用户:3391个
uniqueEvents = set()#uniqueEvents保存总共多少个events:13418个
eventsForUser = defaultdict(set)#字典eventsForUser保存了每个user:所对应的event
usersForEvent = defaultdict(set)#字典usersForEvent保存了每个event:哪些user点击
for filename in ['train.csv', 'test.csv']:
f = open(filename)
f.readline()#skip the header line
for line in f:
cols = line.strip().split(',')
uniqueUsers.add( cols[0] )
uniqueEvents.add( cols[1] )
eventsForUser[cols[0]].add( cols[1] )
usersForEvent[cols[1]].add( cols[0] )
f.close()
self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
self.userIndex = dict()
self.eventIndex = dict()
for i, u in enumerate(uniqueUsers):
self.userIndex[u] = i
for i, e in enumerate(uniqueEvents):
self.eventIndex[e] = i
ftrain = open('train.csv')
ftrain.readline()
for line in ftrain:
cols = line.strip().split(',')
i = self.userIndex[ cols[0] ]
j = self.eventIndex[ cols[1] ]
self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
ftrain.close()
sio.mmwrite('PE_userEventScores', self.userEventScores)
#to avoid unnecessary computation, we collect only the associated user pairs and event pairs
#an associated user pair is two users who acted on at least one common event
#an associated event pair is two events that at least one common user acted on
self.uniqueUserPairs = set()
self.uniqueEventPairs = set()
for event in uniqueEvents:
users = usersForEvent[event]
if len(users) > 2:
self.uniqueUserPairs.update( itertools.combinations(users, 2) )
for user in uniqueUsers:
events = eventsForUser[user]
if len(events) > 2:
self.uniqueEventPairs.update( itertools.combinations(events, 2) )
#print(self.userIndex)
cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
#data cleaning class
class DataCleaner:
def __init__(self):
#helpers converting strings to numeric values
#load locale aliases
self.localeIdMap = defaultdict(int)
for i, l in enumerate(locale.locale_alias.keys()):
self.localeIdMap[l] = i + 1
#load countries
self.countryIdMap = defaultdict(int)
ctryIdx = defaultdict(int)
for i, c in enumerate(pycountry.countries):
self.countryIdMap[c.name.lower()] = i + 1
if c.name.lower() == 'usa':
ctryIdx['US'] = i
if c.name.lower() == 'canada':
ctryIdx['CA'] = i
for cc in ctryIdx.keys():
for s in pycountry.subdivisions.get(country_code=cc):
self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
#gender mapping
self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
#locale id
def getLocaleId(self, locstr):
#localeIdMap is a defaultdict(int), so an unknown locstr.lower() returns the default int 0
return self.localeIdMap[ locstr.lower() ]
#birthyear
def getBirthYearInt(self, birthYear):
try:
return 0 if birthYear == 'None' else int(birthYear)
except:
return 0
#gender
def getGenderId(self, genderStr):
return self.genderIdMap[genderStr]
#joinedAt
def getJoinedYearMonth(self, dateString):
dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
return "".join( [str(dttm.year), str(dttm.month) ] )
#location: take the word after the last space as the country name
def getCountryId(self, location):
if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:
return self.countryIdMap[ location[location.rindex(' ') + 1: ].lower() ]#'+ 1' starts right after the last space
else:
return 0
#timezone
def getTimezoneInt(self, timezone):
try:
return int(timezone)
except:
return 0
#class building the user-user similarity matrix
class Users:
"""
Build the user/user similarity matrix
"""
def __init__(self, programEntities, sim=ssd.correlation):#ssd.correlation(u, v) computes the correlation distance between u and v (1 - Pearson correlation)
cleaner = DataCleaner()
nusers = len(programEntities.userIndex.keys())#3391
#print(nusers)
fin = open('users.csv')
colnames = fin.readline().strip().split(',') #7 feature columns
self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )#build the sparse matrix
for line in fin:
cols = line.strip().split(',')
#"only consider users appearing in train.csv" -- this comment comes from the original author,
#but userIndex actually contains all users from both train and test, so it is not quite accurate
#build the user matrix from the cleaned raw data
if cols[0] in programEntities.userIndex:
i = programEntities.userIndex[ cols[0] ]#index of this user
self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )#locale
self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )#birthyear, missing values filled with 0
self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )#gender
self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )#joinedAt column
self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )#location
self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )#timezone
fin.close()
#normalize the matrix
self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
sio.mmwrite('US_userMatrix', self.userMatrix)
#compute the user similarity matrix, used later
self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )#(3391,3391)
for i in range(0, nusers):
self.userSimMatrix[i, i] = 1.0
for u1, u2 in programEntities.uniqueUserPairs:
i = programEntities.userIndex[u1] #index of user u1
j = programEntities.userIndex[u2]
if (i, j) not in self.userSimMatrix:
#print(self.userMatrix.getrow(i).todense()) e.g. [[0.00028123,0.00029847,0.00043592,0.00035208,0,0.00032346]]
#print(self.userMatrix.getrow(j).todense()) e.g. [[0.00028123,0.00029742,0.00043592,0.00035208,0,-0.00032346]]
usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense()) #correlation distance between the two user vectors; the matrix is symmetric
self.userSimMatrix[i, j] = usim
self.userSimMatrix[j, i] = usim
sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
print('Step 1: collecting user and event information...')
pe = ProgramEntities()
print('Step 1 complete...\n')
print('Step 2: computing user similarity and storing it as a matrix...')
Users(pe)
print('Step 2 complete...\n')
Step 1: collecting user and event information...
Step 1 complete...
Step 2: computing user similarity and storing it as a matrix...
Step 2 complete...
Step 3: Process user social relationship information
This step needs the user_friends.csv.gz file; let's look at its contents first:
import pandas as pd
df_user_friends = pd.read_csv('user_friends.csv.gz', compression='gzip')
df_user_friends.head()
 | user | friends
---|---|---
0 | 3197468391 | 1346449342 3873244116 4226080662 1222907620 54... |
1 | 3537982273 | 1491560444 395798035 2036380346 899375619 3534... |
2 | 823183725 | 1484954627 1950387873 1652977611 4185960823 42... |
3 | 1872223848 | 83361640 723814682 557944478 1724049724 253059... |
4 | 3429017717 | 4253303705 2130310957 1838389374 3928735761 71... |
- 1) If you have more friends, you may be more extroverted and more likely to attend all kinds of events
- 2) If your friends attend an event, you may well tag along; the sketch below illustrates the friend-influence score
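A toy sketch of the friend-influence score used in UserFriends below (made-up numbers): a friend's row of userEventScores is averaged over all events, so +1 "interested" answers pull the score up and -1 "not interested" answers pull it down:
import numpy as np
import scipy.sparse as ss

nevents = 10
row = ss.dok_matrix((1, nevents))#one friend's reactions to 10 toy events
row[0, 1] = 1  #interested in event 1
row[0, 4] = -1 #not interested in event 4
row[0, 7] = 1  #interested in event 7

eventsForUser = row.todense()
score = eventsForUser.sum() / np.shape(eventsForUser)[1]
print(score)#0.1 = (1 - 1 + 1) / 10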
# Full code for step 3
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
import scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalize
import gzip
import numpy as np
#process the user-event association data
class ProgramEntities:
"""
我们只关心train和test中出现的user和event,因此重点处理这部分关联数据,
经过统计:train和test中总共3391个users和13418个events
"""
def __init__(self):
#统计训练集中有多少独立的用户的events
uniqueUsers = set()#uniqueUsers保存总共多少个用户:3391个
uniqueEvents = set()#uniqueEvents保存总共多少个events:13418个
eventsForUser = defaultdict(set)#字典eventsForUser保存了每个user:所对应的event
usersForEvent = defaultdict(set)#字典usersForEvent保存了每个event:哪些user点击
for filename in ['train.csv', 'test.csv']:
f = open(filename)
f.readline()#skip the header line
for line in f:
cols = line.strip().split(',')
uniqueUsers.add( cols[0] )
uniqueEvents.add( cols[1] )
eventsForUser[cols[0]].add( cols[1] )
usersForEvent[cols[1]].add( cols[0] )
f.close()
self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
self.userIndex = dict()
self.eventIndex = dict()
for i, u in enumerate(uniqueUsers):
self.userIndex[u] = i
for i, e in enumerate(uniqueEvents):
self.eventIndex[e] = i
ftrain = open('train.csv')
ftrain.readline()
for line in ftrain:
cols = line.strip().split(',')
i = self.userIndex[ cols[0] ]
j = self.eventIndex[ cols[1] ]
self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
ftrain.close()
sio.mmwrite('PE_userEventScores', self.userEventScores)
#to avoid unnecessary computation, we collect only the associated user pairs and event pairs
#an associated user pair is two users who acted on at least one common event
#an associated event pair is two events that at least one common user acted on
self.uniqueUserPairs = set()
self.uniqueEventPairs = set()
for event in uniqueEvents:
users = usersForEvent[event]
if len(users) > 2:
self.uniqueUserPairs.update( itertools.combinations(users, 2) )
for user in uniqueUsers:
events = eventsForUser[user]
if len(events) > 2:
self.uniqueEventPairs.update( itertools.combinations(events, 2) )
#print(self.userIndex)
cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
#data cleaning class
class DataCleaner:
def __init__(self):
#helpers converting strings to numeric values
#load locale aliases
self.localeIdMap = defaultdict(int)
for i, l in enumerate(locale.locale_alias.keys()):
self.localeIdMap[l] = i + 1
#load countries
self.countryIdMap = defaultdict(int)
ctryIdx = defaultdict(int)
for i, c in enumerate(pycountry.countries):
self.countryIdMap[c.name.lower()] = i + 1
if c.name.lower() == 'usa':
ctryIdx['US'] = i
if c.name.lower() == 'canada':
ctryIdx['CA'] = i
for cc in ctryIdx.keys():
for s in pycountry.subdivisions.get(country_code=cc):
self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
#locale id
def getLocaleId(self, locstr):
#localeIdMap is a defaultdict(int), so an unknown locstr.lower() returns the default int 0
return self.localeIdMap[ locstr.lower() ]
#birthyear
def getBirthYearInt(self, birthYear):
try:
return 0 if birthYear == 'None' else int(birthYear)
except:
return 0
#gender
def getGenderId(self, genderStr):
return self.genderIdMap[genderStr]
#joinedAt
def getJoinedYearMonth(self, dateString):
dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
return "".join( [str(dttm.year), str(dttm.month) ] )
#location: take the word after the last space as the country name
def getCountryId(self, location):
if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:
return self.countryIdMap[ location[location.rindex(' ') + 1: ].lower() ]#'+ 1' starts right after the last space
else:
return 0
#timezone
def getTimezoneInt(self, timezone):
try:
return int(timezone)
except:
return 0
#user-user similarity matrix
class Users:
"""
Build the user/user similarity matrix
"""
def __init__(self, programEntities, sim=ssd.correlation):#ssd.correlation(u, v) computes the correlation distance between u and v (1 - Pearson correlation)
cleaner = DataCleaner()
nusers = len(programEntities.userIndex.keys())#3391
#print(nusers)
fin = open('users.csv')
colnames = fin.readline().strip().split(',') #7 feature columns
self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )#build the sparse matrix
for line in fin:
cols = line.strip().split(',')
#"only consider users appearing in train.csv" -- this comment comes from the original author,
#but userIndex actually contains all users from both train and test, so it is not quite accurate
if cols[0] in programEntities.userIndex:
i = programEntities.userIndex[ cols[0] ]#index of this user
self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )#locale
self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )#birthyear, missing values filled with 0
self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )#gender
self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )#joinedAt column
self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )#location
self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )#timezone
fin.close()
#normalize the matrix
self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
sio.mmwrite('US_userMatrix', self.userMatrix)
#compute the user similarity matrix, used later
self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )#(3391,3391)
for i in range(0, nusers):
self.userSimMatrix[i, i] = 1.0
for u1, u2 in programEntities.uniqueUserPairs:
i = programEntities.userIndex[u1]
j = programEntities.userIndex[u2]
if (i, j) not in self.userSimMatrix:
#print(self.userMatrix.getrow(i).todense()) e.g. [[0.00028123,0.00029847,0.00043592,0.00035208,0,0.00032346]]
#print(self.userMatrix.getrow(j).todense()) e.g. [[0.00028123,0.00029742,0.00043592,0.00035208,0,-0.00032346]]
usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())
self.userSimMatrix[i, j] = usim
self.userSimMatrix[j, i] = usim
sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
#mining user social relations
class UserFriends:
"""
Find each user's friends; the idea is simple:
1) if you have more friends, you may be extroverted and more likely to attend events
2) if your friends attend an event, you may well attend it too
"""
def __init__(self, programEntities):
nusers = len(programEntities.userIndex.keys())#3391 users
self.numFriends = np.zeros( (nusers) )#array([0., 0., 0., ..., 0., 0., 0.]), number of friends per user
self.userFriends = ss.dok_matrix( (nusers, nusers) ) #accumulated event-score influence between each user and their friends
fin = gzip.open('user_friends.csv.gz')
print( 'Header In User_friends.csv.gz:',fin.readline() )
ln = 0
#read user_friends.csv.gz line by line
#a first-column user matters to us only if it appears in userIndex
#for such a user, fetch its index and its number of friends
#for each of the user's friends that also appears in userIndex, fetch the friend's index,
#then look up that friend's reactions to every event in userEventScores
#score is the friend's average score across all events
#the userFriends matrix records these user-friend scores
#e.g. user 851286067 (index 1750) appears in test.csv and has 2151 friends in user_friends.csv.gz,
#so its share of friends is 2151 / sumNumFriends = 2151 / 3731377.0 = 0.0005764627910822198
for line in fin:
if ln % 200 == 0:
print( 'Loading line:', ln )
cols = line.decode().strip().split(',')
user = cols[0]
if user in programEntities.userIndex:
friends = cols[1].split(' ')#the user's list of friends
i = programEntities.userIndex[user]
self.numFriends[i] = len(friends)
for friend in friends:
if friend in programEntities.userIndex:
j = programEntities.userIndex[friend]
#the objective of this score is to infer the degree to
#and direction in which this friend will influence the
#user's decision, so we sum the user/event score for
#this user across all training events
eventsForUser = programEntities.userEventScores.getrow(j).todense()#the friend's reaction to each event: 0, 1, or -1
#print(eventsForUser.sum(), np.shape(eventsForUser)[1] )
#score is the friend's average over all 13418 events
score = eventsForUser.sum() / np.shape(eventsForUser)[1]#np.shape(eventsForUser)[1] = 13418
#print(score)
self.userFriends[i, j] += score
self.userFriends[j, i] += score
ln += 1
fin.close()
#normalize the arrays
sumNumFriends = self.numFriends.sum(axis=0)#total number of friends across all users
print(sumNumFriends)
self.numFriends = self.numFriends / sumNumFriends#each user's share of the total friend count
sio.mmwrite('UF_numFriends', np.matrix(self.numFriends) ) #save the user-friend-count matrix
self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
sio.mmwrite('UF_userFriends', self.userFriends) #save the user-friend event-score matrix
print('Step 1: collecting user and event information...')
pe = ProgramEntities()
print('Step 1 complete...\n')
print('Step 2: computing user similarity and storing it as a matrix...')
#Users(pe)
print('Step 2 complete...\n')
print('Step 3: computing user social relationship information and storing it...')
UserFriends(pe)
print('Step 3 complete...\n')
Step 1: collecting user and event information...
Step 1 complete...
Step 2: computing user similarity and storing it as a matrix...
Step 2 complete...
Step 3: computing user social relationship information and storing it...
Header In User_friends.csv.gz: b'user,friends\n'
Loading line: 0
Loading line: 200
Loading line: 400
...
Loading line: 38000
Loading line: 38200
(output truncated: one "Loading line" message every 200 rows)
3731377.0
Step 3 complete...
Step 4: Build event-event similarity data
Let's first look at events.csv.gz:
import pandas as pd
df_events_csv = pd.read_csv('events.csv.gz', compression='gzip')
df_events_csv.head()
 | event_id | user_id | start_time | city | state | zip | country | lat | lng | c_1 | ... | c_92 | c_93 | c_94 | c_95 | c_96 | c_97 | c_98 | c_99 | c_100 | c_other
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 684921758 | 3647864012 | 2012-10-31T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 2 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 |
1 | 244999119 | 3476440521 | 2012-11-03T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
2 | 3928440935 | 517514445 | 2012-11-05T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 |
3 | 2582345152 | 781585781 | 2012-10-30T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 |
4 | 1051165850 | 1016098580 | 2012-09-27T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 |
5 rows × 110 columns
Converting the above information to numeric values:
1. The start_time column is processed with the datetime library, as sketched below
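No snippet for start_time is shown here, but presumably the same pattern as getJoinedYearMonth in step 2 applies; a minimal sketch under that assumption (getEventYearMonth is a hypothetical helper name):
import datetime

def getEventYearMonth(dateString):
    #assumption: start_time strings look like '2012-10-31T00:00:00.001Z'
    try:
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "".join([str(dttm.year), str(dttm.month)])
    except (TypeError, ValueError):
        return 0

print(getEventYearMonth('2012-10-31T00:00:00.001Z'))#201210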
2. The city, state, zip and country columns are all processed with the hashlib package; note that only events appearing in train.csv or test.csv go through this numeric conversion
import hashlib
def FeatureHash(value):
if len(value.strip()) == 0:
return -1
else:
return int( hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4] ,16)
print(FeatureHash('Muaraenim'))#47294
print(FeatureHash('a test demo'))#4030
47294
4030
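Taking the first 4 hex digits of the SHA-224 digest and parsing them base-16 maps any non-empty string deterministically to an integer in [0, 65535], while the empty string maps to -1. This is a simple form of feature hashing, so unrelated strings can occasionally collide.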
3. Processing the lat and lng columns
Missing values are filled with 0.0; everything else is converted to its float value
def getFloatValue(self, value):
if len(value.strip()) == 0:
return 0.0
else:
return float(value)
4. Processing the columns from c_1 onward (i.e. after the 10th column)
- A matrix eventContMatrix stores the c_1 through c_100 columns; the c_other column is not used
5. Normalize the eventPropMatrix and eventContMatrix matrices and save them to files
6. Use uniqueEventPairs to compute event-pair similarities
- This uses the correlation and cosine functions from scipy.spatial.distance (see the sketch below)
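A toy sketch of the two distances used for event-event similarity (made-up vectors, not the real event rows): ssd.correlation for the property rows and ssd.cosine for the c_1..c_100 count rows:
import numpy as np
import scipy.spatial.distance as ssd

prop_a = np.array([201210.0, 47294.0, 0.0, 3.5])#e.g. start_time, city hash, lat, lng
prop_b = np.array([201211.0, 47294.0, 0.0, 3.6])
print(ssd.correlation(prop_a, prop_b))#close to 0 -> very similar properties

cont_a = np.array([2.0, 0.0, 1.0, 0.0])#toy word counts
cont_b = np.array([4.0, 0.0, 2.0, 0.0])
print(ssd.cosine(cont_a, cont_b))#0.0 -> identical direction of word counts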
## Full code for step 4
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
import scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalize
import gzip
import numpy as np
import hashlib
#process the user-event association data
class ProgramEntities:
"""
我们只关心train和test中出现的user和event,因此重点处理这部分关联数据,
经过统计:train和test中总共3391个users和13418个events
"""
def __init__(self):
#统计训练集中有多少独立的用户的events
uniqueUsers = set()#uniqueUsers保存总共多少个用户:3391个
uniqueEvents = set()#uniqueEvents保存总共多少个events:13418个
eventsForUser = defaultdict(set)#字典eventsForUser保存了每个user:所对应的event
usersForEvent = defaultdict(set)#字典usersForEvent保存了每个event:哪些user点击
for filename in ['train.csv', 'test.csv']:
f = open(filename)
f.readline()#skip the header line
for line in f:
cols = line.strip().split(',')
uniqueUsers.add( cols[0] )
uniqueEvents.add( cols[1] )
eventsForUser[cols[0]].add( cols[1] )
usersForEvent[cols[1]].add( cols[0] )
f.close()
self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
self.userIndex = dict()
self.eventIndex = dict()
for i, u in enumerate(uniqueUsers):
self.userIndex[u] = i
for i, e in enumerate(uniqueEvents):
self.eventIndex[e] = i
ftrain = open('train.csv')
ftrain.readline()
for line in ftrain:
cols = line.strip().split(',')
i = self.userIndex[ cols[0] ]
j = self.eventIndex[ cols[1] ]
self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
ftrain.close()
sio.mmwrite('PE_userEventScores', self.userEventScores)
#to avoid unnecessary computation, we collect only the associated user pairs and event pairs
#an associated user pair is two users who acted on at least one common event
#an associated event pair is two events that at least one common user acted on
self.uniqueUserPairs = set()
self.uniqueEventPairs = set()
for event in uniqueEvents:
users = usersForEvent[event]
if len(users) > 2:
self.uniqueUserPairs.update( itertools.combinations(users, 2) )
for user in uniqueUsers:
events = eventsForUser[user]
if len(events) > 2:
self.uniqueEventPairs.update( itertools.combinations(events, 2) )
#print(self.userIndex)
cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
#data cleaning class
class DataCleaner:
def __init__(self):
#helpers converting strings to numeric values
#load locale aliases
self.localeIdMap = defaultdict(int)
for i, l in enumerate(locale.locale_alias.keys()):
self.localeIdMap[l] = i + 1
#load countries
self.countryIdMap = defaultdict(int)
ctryIdx = defaultdict(int)
for i, c in enumerate(pycountry.countries):
self.countryIdMap[c.name.lower()] = i + 1
if c.name.lower() == 'usa':
ctryIdx['US'] = i
if c.name.lower() == 'canada':
ctryIdx['CA'] = i
for cc in ctryIdx.keys():
for s in pycountry.subdivisions.get(country_code=cc):
self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
#locale id
def getLocaleId(self, locstr):
#localeIdMap is a defaultdict(int), so an unknown locstr.lower() returns the default int 0
return self.localeIdMap[ locstr.lower() ]
#birthyear
def getBirthYearInt(self, birthYear):
try:
return 0 if birthYear == 'None' else int(birthYear)
except:
return 0
#gender
def getGenderId(self, genderStr):
return self.genderIdMap[genderStr]
#joinedAt
def getJoinedYearMonth(self, dateString):
dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
return "".join( [str(dttm.year), str(dttm.month) ] )
#location: take the word after the last space as the country name
def getCountryId(self, location):
if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:
return self.countryIdMap[ location[location.rindex(' ') + 1: ].lower() ]#'+ 1' starts right after the last space
else:
return 0
#timezone
def getTimezoneInt(self, timezone):
try:
return int(timezone)
except:
return 0
def getFeatureHash(self, value):
if len(value.strip()) == 0:
return -1
else:
#return int( hashlib.sha224(value).hexdigest()[0:4], 16) raises in Python 3:
#TypeError: Unicode-objects must be encoded before hashing
return int( hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4], 16)#in Python 3 the string must be encoded first
def getFloatValue(self, value):
if len(value.strip()) == 0:
return 0.0
else:
return float(value)
#user-user similarity matrix
class Users:
"""
Build the user/user similarity matrix
"""
def __init__(self, programEntities, sim=ssd.correlation):#ssd.correlation(u, v) computes the correlation distance between u and v (1 - Pearson correlation)
cleaner = DataCleaner()
nusers = len(programEntities.userIndex.keys())#3391
#print(nusers)
fin = open('users.csv')
colnames = fin.readline().strip().split(',') #7 feature columns
self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )#build the sparse matrix
for line in fin:
cols = line.strip().split(',')
#"only consider users appearing in train.csv" -- this comment comes from the original author,
#but userIndex actually contains all users from both train and test, so it is not quite accurate
if cols[0] in programEntities.userIndex:
i = programEntities.userIndex[ cols[0] ]#获取user:对应的index
self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )#locale
self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )#birthyear,空值0填充
self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )#处理性别
self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )#处理joinedAt列
self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )#处理location
self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )#处理timezone
fin.close()
        #L1-normalize each feature column
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)
        #compute the user similarity matrix; it is used later
        self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )#(3391, 3391)
for i in range(0, nusers):
self.userSimMatrix[i, i] = 1.0
for u1, u2 in programEntities.uniqueUserPairs:
i = programEntities.userIndex[u1]
j = programEntities.userIndex[u2]
if (i, j) not in self.userSimMatrix:
                #print(self.userMatrix.getrow(i).todense()) e.g. [[0.00028123,0.00029847,0.00043592,0.00035208,0,0.00032346]]
                #print(self.userMatrix.getrow(j).todense()) e.g. [[0.00028123,0.00029742,0.00043592,0.00035208,0,-0.00032346]]
usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())
self.userSimMatrix[i, j] = usim
self.userSimMatrix[j, i] = usim
sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
#mining user social relations
class UserFriends:
    """
    Find each user's friends; the idea is very simple:
    1) if you have many friends, you may be outgoing and more likely to attend events
    2) if your friends attend an event, you may go along with them
    """
    def __init__(self, programEntities):
        nusers = len(programEntities.userIndex.keys())#3391
        self.numFriends = np.zeros( (nusers) )#array([0., 0., ..., 0.]), number of friends per user
        self.userFriends = ss.dok_matrix( (nusers, nusers) )
        fin = gzip.open('user_friends.csv.gz')
        print( 'Header In User_friends.csv.gz:',fin.readline() )
        ln = 0
        #read user_friends.csv.gz line by line
        #only users present in userIndex are of interest; for each such user,
        #take its index and friend count
        #for every friend also in userIndex, look up the friend's row in userEventScores,
        #i.e. that friend's reaction to each event
        #score is the friend's average score over all events
        #the userFriends matrix records these user-friend scores
        #e.g. user 851286067 (row 1750) appears in test.csv and has 2151 friends in user_friends.csv.gz,
        #so its share is 2151 / sumNumFriends = 2151 / 3731377.0 = 0.0005764627910822198
for line in fin:
if ln % 200 == 0:
print( 'Loading line:', ln )
cols = line.decode().strip().split(',')
user = cols[0]
if user in programEntities.userIndex:
                friends = cols[1].split(' ')#the user's friend list
                i = programEntities.userIndex[user]
                self.numFriends[i] = len(friends)
                for friend in friends:
                    if friend in programEntities.userIndex:
                        j = programEntities.userIndex[friend]
                        #the objective of this score is to infer the degree to which,
                        #and the direction in which, this friend will influence the
                        #user's decision, so we sum the user/event score for
                        #this user across all training events
                        eventsForUser = programEntities.userEventScores.getrow(j).todense()#the friend's reaction to each event: 0, 1, or -1
                        #print(eventsForUser.sum(), np.shape(eventsForUser)[1] )
                        #score is the friend's average over all 13418 events
                        score = eventsForUser.sum() / np.shape(eventsForUser)[1]#np.shape(eventsForUser)[1] = 13418
                        #print(score)
                        self.userFriends[i, j] += score
                        self.userFriends[j, i] += score
ln += 1
fin.close()
        #normalize the array
        sumNumFriends = self.numFriends.sum(axis=0)#total number of friends over all users
        #print(sumNumFriends)
        self.numFriends = self.numFriends / sumNumFriends#each user's share of the total friend count
sio.mmwrite('UF_numFriends', np.matrix(self.numFriends) )
self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
sio.mmwrite('UF_userFriends', self.userFriends)
#build event-event similarity data
class Events:
    """
    Build event-event similarities; note there are 2 kinds:
    1) similarity derived from user-event behavior, as in collaborative filtering
    2) similarity computed from the event's own content (event metadata)
    """
    def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
        cleaner = DataCleaner()
        fin = gzip.open('events.csv.gz')
        fin.readline()#skip header
        nevents = len(programEntities.eventIndex) #number of events
        print(nevents)#13418
        self.eventPropMatrix = ss.dok_matrix( (nevents, 7) ) #the first 7 property features per event
        self.eventContMatrix = ss.dok_matrix( (nevents, 100) ) #the 100 content features per event
ln = 0
for line in fin:
#if ln > 10:
#break
cols = line.decode().strip().split(',')
eventId = cols[0]
if eventId in programEntities.eventIndex:
i = programEntities.eventIndex[eventId]
self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth( cols[2] )#start_time
self.eventPropMatrix[i, 1] = cleaner.getFeatureHash( cols[3] )#city
self.eventPropMatrix[i, 2] = cleaner.getFeatureHash( cols[4] )#state
self.eventPropMatrix[i, 3] = cleaner.getFeatureHash( cols[5] )#zip
self.eventPropMatrix[i, 4] = cleaner.getFeatureHash( cols[6] )#country
self.eventPropMatrix[i, 5] = cleaner.getFloatValue( cols[7] )#lat
self.eventPropMatrix[i, 6] = cleaner.getFloatValue( cols[8] )#lon
                #copy the 100 content count columns (columns 10-109) into the content matrix
                for j in range(9, 109):
                    self.eventContMatrix[i, j-9] = cols[j]
ln += 1
fin.close()
        #L1-normalize feature matrix 1
        self.eventPropMatrix = normalize(self.eventPropMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventPropMatrix', self.eventPropMatrix)
        #L1-normalize feature matrix 2
        self.eventContMatrix = normalize(self.eventContMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventContMatrix', self.eventContMatrix)
#calculate similarity between event pairs based on the two matrices
self.eventPropSim = ss.dok_matrix( (nevents, nevents) )
self.eventContSim = ss.dok_matrix( (nevents, nevents) )
for e1, e2 in programEntities.uniqueEventPairs:
i = programEntities.eventIndex[e1]
j = programEntities.eventIndex[e2]
            #similarity over the 7 property features
if not ((i, j) in self.eventPropSim):
epsim = psim( self.eventPropMatrix.getrow(i).todense(), self.eventPropMatrix.getrow(j).todense())
self.eventPropSim[i, j] = epsim
self.eventPropSim[j, i] = epsim
            #similarity over the 100 content features
if not ((i, j) in self.eventContSim):
ecsim = csim( self.eventContMatrix.getrow(i).todense(), self.eventContMatrix.getrow(j).todense())
self.eventContSim[i, j] = ecsim
self.eventContSim[j, i] = ecsim
sio.mmwrite('EV_eventPropSim', self.eventPropSim)
sio.mmwrite('EV_eventContSim', self.eventContSim)
print('Step 1: gather user and event statistics...')
pe = ProgramEntities()
print('Step 1 done...\n')
print('Step 2: compute user similarities and store them as a matrix...')
#Users(pe)  #commented out here; presumably the US_* matrices were already generated in an earlier run
print('Step 2 done...\n')
print('Step 3: compute user social relations and store them...')
UserFriends(pe)
print('Step 3 done...\n')
print('Step 4: compute event similarities and store them as matrices...')
Events(pe)
print('Step 4 done...\n')
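Before moving on, it can help to sanity-check the artifacts the script wrote to disk. A minimal sketch (assuming the PE_* files above exist in the working directory; the expected counts come from the statistics quoted earlier):
import scipy.io as sio
import _pickle as cPickle
userIndex = cPickle.load( open('PE_userIndex.pkl', 'rb') )
userEventScores = sio.mmread('PE_userEventScores')
print( len(userIndex), userEventScores.shape )#expect 3391 users and a 3391 x 13418 score matrix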
Step 5: activity / event popularity data
Since this step uses the event_attendees.csv.gz file, let's take a look at it first
import pandas as pd
df_events_attendees = pd.read_csv('event_attendees.csv.gz', compression='gzip')
df_events_attendees.head()
event | yes | maybe | invited | no | |
---|---|---|---|---|---|
0 | 1159822043 | 1975964455 252302513 4226086795 3805886383 142... | 2733420590 517546982 1350834692 532087573 5831... | 1723091036 3795873583 4109144917 3560622906 31... | 3575574655 1077296663 |
1 | 686467261 | 2394228942 2686116898 1056558062 3792942231 41... | 1498184352 645689144 3770076778 331335845 4239... | 1788073374 733302094 1830571649 676508092 7081... | NaN |
2 | 1186208412 | NaN | 3320380166 3810793697 | 1379121209 440668682 | 1728988561 2950720854 |
3 | 2621578336 | NaN | NaN | NaN | NaN |
4 | 855842686 | 2406118796 3550897984 294255260 1125817077 109... | 2671721559 1761448345 2356975806 2666669465 10... | 1518670705 880919237 2326414227 2673818347 332... | 3500235232 |
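The EventAttendees class below will score event popularity from the yes and no columns of this file. As a quick preview, a hedged pandas sketch of the same count (note that str.split() treats NaN/empty fields as zero attendees, unlike the ' '-split used later in the class):
yes_cnt = df_events_attendees['yes'].fillna('').str.split().str.len()
no_cnt = df_events_attendees['no'].fillna('').str.split().str.len()
print( (yes_cnt - no_cnt).head() )#per-event yes-minus-no counts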
## Full code for step 5
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
import scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalize
import gzip
import numpy as np
import hashlib
#process user-event association data
class ProgramEntities:
    """
    We only care about users and events that appear in train and test,
    so we focus on this associated data; by count, train and test contain
    3391 users and 13418 events in total
    """
    def __init__(self):
        #count the distinct users and events in the data
        uniqueUsers = set()#uniqueUsers holds all users: 3391 of them
        uniqueEvents = set()#uniqueEvents holds all events: 13418 of them
        eventsForUser = defaultdict(set)#maps each user to the events it acted on
        usersForEvent = defaultdict(set)#maps each event to the users who acted on it
for filename in ['train.csv', 'test.csv']:
f = open(filename)
            f.readline()#skip the header line
for line in f:
cols = line.strip().split(',')
uniqueUsers.add( cols[0] )
uniqueEvents.add( cols[1] )
eventsForUser[cols[0]].add( cols[1] )
usersForEvent[cols[1]].add( cols[0] )
f.close()
self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
self.userIndex = dict()
self.eventIndex = dict()
for i, u in enumerate(uniqueUsers):
self.userIndex[u] = i
for i, e in enumerate(uniqueEvents):
self.eventIndex[e] = i
ftrain = open('train.csv')
ftrain.readline()
for line in ftrain:
cols = line.strip().split(',')
i = self.userIndex[ cols[0] ]
j = self.eventIndex[ cols[1] ]
self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
ftrain.close()
sio.mmwrite('PE_userEventScores', self.userEventScores)
        #To avoid unnecessary computation, we collect all associated users and associated events
        #an associated user pair means two users who acted on at least one common event
        #an associated event pair means two events that at least one common user acted on
self.uniqueUserPairs = set()
self.uniqueEventPairs = set()
for event in uniqueEvents:
users = usersForEvent[event]
if len(users) > 2:
self.uniqueUserPairs.update( itertools.combinations(users, 2) )
for user in uniqueUsers:
events = eventsForUser[user]
if len(events) > 2:
self.uniqueEventPairs.update( itertools.combinations(events, 2) )
        #print(self.userIndex)
cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
#Data cleaning class
class DataCleaner:
    def __init__(self):
        #helper methods that convert strings to numeric values
        #load locales
        self.localeIdMap = defaultdict(int)
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1
        #load countries
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            #note: depending on the pycountry version, the US entry may be named
            #'United States', in which case the 'usa' check below never matches
            if c.name.lower() == 'usa':
                ctryIdx['US'] = i
            if c.name.lower() == 'canada':
                ctryIdx['CA'] = i
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
        self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
    #locale handling
    def getLocaleId(self, locstr):
        #localeIdMap is a defaultdict(int), so if locstr.lower() is not a key it returns the default int 0
        return self.localeIdMap[ locstr.lower() ]
    #birthyear handling: empty or invalid values become 0
    def getBirthYearInt(self, birthYear):
        try:
            return 0 if birthYear == 'None' else int(birthYear)
        except ValueError:
            return 0
    #gender handling
    def getGenderId(self, genderStr):
        return self.genderIdMap[genderStr]
    #joinedAt handling: encode the timestamp as year followed by month
    def getJoinedYearMonth(self, dateString):
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        #note: the month is not zero-padded, so e.g. 2012-01 becomes '20121'
        return "".join( [str(dttm.year), str(dttm.month) ] )
    #location handling: map the token after the last space to a country id
    def getCountryId(self, location):
        if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:
            return self.countryIdMap[ location[location.rindex(' ') + 2: ].lower() ]
        else:
            return 0
    #timezone handling
    def getTimezoneInt(self, timezone):
        try:
            return int(timezone)
        except ValueError:
            return 0
    def getFeatureHash(self, value):
        if len(value.strip()) == 0:
            return -1
        else:
            #int( hashlib.sha224(value).hexdigest()[0:4], 16) fails on Python 3 with
            #TypeError: Unicode-objects must be encoded before hashing
            return int( hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4], 16)#so encode first
def getFloatValue(self, value):
if len(value.strip()) == 0:
return 0.0
else:
return float(value)
#user-user similarity matrix
class Users:
    """
    Build the user/user similarity matrix
    """
    def __init__(self, programEntities, sim=ssd.correlation):#ssd.correlation(u, v) computes the correlation distance between vectors u and v
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())#3391
        #print(nusers)
        fin = open('users.csv')
        colnames = fin.readline().strip().split(',') #7 feature columns
        self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )#sparse matrix of user features
        for line in fin:
            cols = line.strip().split(',')
            #the original author's comment said "only consider users that appear in train.csv",
            #but userIndex actually contains every user from both train and test,
            #so this filter keeps any user seen in either file
            if cols[0] in programEntities.userIndex:
                i = programEntities.userIndex[ cols[0] ]#index of this user
                self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )#locale
                self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )#birthyear, 0 for missing values
                self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )#gender
                self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )#joinedAt
                self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )#location
                self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )#timezone
fin.close()
        #L1-normalize each feature column
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)
        #compute the user similarity matrix; it is used later
        self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )#(3391, 3391)
for i in range(0, nusers):
self.userSimMatrix[i, i] = 1.0
for u1, u2 in programEntities.uniqueUserPairs:
i = programEntities.userIndex[u1]
j = programEntities.userIndex[u2]
if (i, j) not in self.userSimMatrix:
                #print(self.userMatrix.getrow(i).todense()) e.g. [[0.00028123,0.00029847,0.00043592,0.00035208,0,0.00032346]]
                #print(self.userMatrix.getrow(j).todense()) e.g. [[0.00028123,0.00029742,0.00043592,0.00035208,0,-0.00032346]]
usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())
self.userSimMatrix[i, j] = usim
self.userSimMatrix[j, i] = usim
sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
#mining user social relations
class UserFriends:
    """
    Find each user's friends; the idea is very simple:
    1) if you have many friends, you may be outgoing and more likely to attend events
    2) if your friends attend an event, you may go along with them
    """
    def __init__(self, programEntities):
        nusers = len(programEntities.userIndex.keys())#3391
        self.numFriends = np.zeros( (nusers) )#array([0., 0., ..., 0.]), number of friends per user
        self.userFriends = ss.dok_matrix( (nusers, nusers) )
        fin = gzip.open('user_friends.csv.gz')
        print( 'Header In User_friends.csv.gz:',fin.readline() )
        ln = 0
        #read user_friends.csv.gz line by line
        #only users present in userIndex are of interest; for each such user,
        #take its index and friend count
        #for every friend also in userIndex, look up the friend's row in userEventScores,
        #i.e. that friend's reaction to each event
        #score is the friend's average score over all events
        #the userFriends matrix records these user-friend scores
        #e.g. user 851286067 (row 1750) appears in test.csv and has 2151 friends in user_friends.csv.gz,
        #so its share is 2151 / sumNumFriends = 2151 / 3731377.0 = 0.0005764627910822198
for line in fin:
if ln % 200 == 0:
print( 'Loading line:', ln )
cols = line.decode().strip().split(',')
user = cols[0]
if user in programEntities.userIndex:
                friends = cols[1].split(' ')#the user's friend list
                i = programEntities.userIndex[user]
                self.numFriends[i] = len(friends)
                for friend in friends:
                    if friend in programEntities.userIndex:
                        j = programEntities.userIndex[friend]
                        #the objective of this score is to infer the degree to which,
                        #and the direction in which, this friend will influence the
                        #user's decision, so we sum the user/event score for
                        #this user across all training events
                        eventsForUser = programEntities.userEventScores.getrow(j).todense()#the friend's reaction to each event: 0, 1, or -1
                        #print(eventsForUser.sum(), np.shape(eventsForUser)[1] )
                        #score is the friend's average over all 13418 events
                        score = eventsForUser.sum() / np.shape(eventsForUser)[1]#np.shape(eventsForUser)[1] = 13418
                        #print(score)
                        self.userFriends[i, j] += score
                        self.userFriends[j, i] += score
ln += 1
fin.close()
        #normalize the array
        sumNumFriends = self.numFriends.sum(axis=0)#total number of friends over all users
        #print(sumNumFriends)
        self.numFriends = self.numFriends / sumNumFriends#each user's share of the total friend count
sio.mmwrite('UF_numFriends', np.matrix(self.numFriends) )
self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
sio.mmwrite('UF_userFriends', self.userFriends)
#build event-event similarity data
class Events:
    """
    Build event-event similarities; note there are 2 kinds:
    1) similarity derived from user-event behavior, as in collaborative filtering
    2) similarity computed from the event's own content (event metadata)
    """
    def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
        cleaner = DataCleaner()
        fin = gzip.open('events.csv.gz')
        fin.readline()#skip header
        nevents = len(programEntities.eventIndex)
        print(nevents)#13418
        self.eventPropMatrix = ss.dok_matrix( (nevents, 7) )
        self.eventContMatrix = ss.dok_matrix( (nevents, 100) )
ln = 0
for line in fin:
#if ln > 10:
#break
cols = line.decode().strip().split(',')
eventId = cols[0]
if eventId in programEntities.eventIndex:
i = programEntities.eventIndex[eventId]
self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth( cols[2] )#start_time
self.eventPropMatrix[i, 1] = cleaner.getFeatureHash( cols[3] )#city
self.eventPropMatrix[i, 2] = cleaner.getFeatureHash( cols[4] )#state
self.eventPropMatrix[i, 3] = cleaner.getFeatureHash( cols[5] )#zip
self.eventPropMatrix[i, 4] = cleaner.getFeatureHash( cols[6] )#country
self.eventPropMatrix[i, 5] = cleaner.getFloatValue( cols[7] )#lat
self.eventPropMatrix[i, 6] = cleaner.getFloatValue( cols[8] )#lon
for j in range(9, 109):
self.eventContMatrix[i, j-9] = cols[j]
ln += 1
fin.close()
self.eventPropMatrix = normalize(self.eventPropMatrix, norm='l1', axis=0, copy=False)
sio.mmwrite('EV_eventPropMatrix', self.eventPropMatrix)
self.eventContMatrix = normalize(self.eventContMatrix, norm='l1', axis=0, copy=False)
sio.mmwrite('EV_eventContMatrix', self.eventContMatrix)
#calculate similarity between event pairs based on the two matrices
self.eventPropSim = ss.dok_matrix( (nevents, nevents) )
self.eventContSim = ss.dok_matrix( (nevents, nevents) )
for e1, e2 in programEntities.uniqueEventPairs:
i = programEntities.eventIndex[e1]
j = programEntities.eventIndex[e2]
if not ((i, j) in self.eventPropSim):
epsim = psim( self.eventPropMatrix.getrow(i).todense(), self.eventPropMatrix.getrow(j).todense())
if np.isnan(epsim):
epsim = 0
self.eventPropSim[i, j] = epsim
self.eventPropSim[j, i] = epsim
if not ((i, j) in self.eventContSim):
            #if one of the two vectors is all zeros, cosine distance returns nan
            """
            import numpy as np
            a = np.array([0, 1, 1, 1, 0, 0, 0, 1, 0, 0])
            b = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
            from scipy.spatial.distance import cosine
            temp = cosine(a, b)
            triggers the following warning:
            Warning (from warnings module):
              File "D:\Python35\lib\site-packages\scipy\spatial\distance.py", line 644
                dist = 1.0 - uv / np.sqrt(uu * vv)
            RuntimeWarning: invalid value encountered in double_scalars
            """
ecsim = csim( self.eventContMatrix.getrow(i).todense(), self.eventContMatrix.getrow(j).todense())
if np.isnan(ecsim):
ecsim = 0
self.eventContSim[i, j] = ecsim
self.eventContSim[j, i] = ecsim
sio.mmwrite('EV_eventPropSim', self.eventPropSim)
sio.mmwrite('EV_eventContSim', self.eventContSim)
#step 5
class EventAttendees:
    """
    Count how many people attend or skip each event,
    as the basis for an event popularity score
    """
    def __init__(self, programEntities):
        nevents = len(programEntities.eventIndex)#13418 events in total
        self.eventPopularity = ss.dok_matrix( (nevents, 1) )
f = gzip.open('event_attendees.csv.gz')
f.readline()#skip header
for line in f:
cols = line.decode().strip().split(',')
eventId = cols[0]
if eventId in programEntities.eventIndex:
i = programEntities.eventIndex[eventId]
                self.eventPopularity[i, 0] = len(cols[1].split(' ')) - len(cols[4].split(' '))#yes count minus no count (note ''.split(' ') == [''], so an empty field still counts as 1)
f.close()
self.eventPopularity = normalize( self.eventPopularity, norm='l1', axis=0, copy=False)
sio.mmwrite('EA_eventPopularity', self.eventPopularity)
def data_prepare():
    """
    Compute all the data and store it as matrices or in other forms,
    so features can be extracted and models built on top of it later
    """
    print('Step 1: gather user and event statistics...')
    pe = ProgramEntities()
    print('Step 1 done...\n')
    print('Step 2: compute user similarities and store them as a matrix...')
    Users(pe)
    print('Step 2 done...\n')
    print('Step 3: compute user social relations and store them...')
    UserFriends(pe)
    print('Step 3 done...\n')
    print('Step 4: compute event similarities and store them as matrices...')
    Events(pe)
    print('Step 4 done...\n')
    print('Step 5: compute event popularity...')
    EventAttendees(pe)
    print('Step 5 done...\n')
#run the data preparation
data_prepare()
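Note that eventPopularity, like the other matrices in this pipeline, is L1-normalized column-wise: each score is divided by the column's sum of absolute values. A tiny illustration with toy numbers (not real data):
import numpy as np
from sklearn.preprocessing import normalize
col = np.array([[3.0], [-1.0], [6.0]])#toy yes-minus-no counts
print( normalize(col, norm='l1', axis=0) )#each entry divided by |3| + |-1| + |6| = 10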
6. Feature construction
#feature construction section
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
import scipy.io as sio
import numpy as np
class DataRewriter:
def __init__(self):
        #load the precomputed matrices for initialization
self.userIndex = cPickle.load( open('PE_userIndex.pkl','rb') )
self.eventIndex = cPickle.load( open('PE_eventIndex.pkl', 'rb') )
self.userEventScores = sio.mmread('PE_userEventScores').todense()
self.userSimMatrix = sio.mmread('US_userSimMatrix').todense()
self.eventPropSim = sio.mmread('EV_eventPropSim').todense()
self.eventContSim = sio.mmread('EV_eventContSim').todense()
self.numFriends = sio.mmread('UF_numFriends')
self.userFriends = sio.mmread('UF_userFriends').todense()
self.eventPopularity = sio.mmread('EA_eventPopularity').todense()
    def userReco(self, userId, eventId):
        """
        Event recommendation score from user-based collaborative filtering
        Basic pseudocode:
          for every item i:
            for every other user v that has a preference for i:
              compute similarity s between u and v
              incorporate v's preference for i weighted by s into a running average
          return top items ranked by weighted average
        """
i = self.userIndex[userId]
j = self.eventIndex[eventId]
vs = self.userEventScores[:, j]
sims = self.userSimMatrix[i, :]
prod = sims * vs
try:
return prod[0, 0] - self.userEventScores[i, j]
except IndexError:
return 0
    def eventReco(self, userId, eventId):
        """
        Event recommendation score from item-based collaborative filtering
        Basic pseudocode:
          for every item i:
            for every item j that u has a preference for:
              compute similarity s between i and j
              add u's preference for j weighted by s to a running average
          return top items ranked by weighted average
        """
i = self.userIndex[userId]
j = self.eventIndex[eventId]
js = self.userEventScores[i, :]
psim = self.eventPropSim[:, j]
csim = self.eventContSim[:, j]
pprod = js * psim
cprod = js * csim
pscore = 0
cscore = 0
try:
pscore = pprod[0, 0] - self.userEventScores[i, j]
except IndexError:
pass
try:
cscore = cprod[0, 0] - self.userEventScores[i, j]
except IndexError:
pass
return pscore, cscore
    def userPop(self, userId):
        """
        Infer how social a user is from the number of friends;
        the assumption is that users with many friends are more
        inclined to take part in social events
        """
if userId in self.userIndex:
i = self.userIndex[userId]
try:
return self.numFriends[0, i]
except IndexError:
return 0
else:
return 0
    def friendInfluence(self, userId):
        """
        Friends' influence on the user:
        of all the user's friends, how many actively attend events;
        a circle of friends who actively attend events may influence the user
        """
        nusers = np.shape(self.userFriends)[1]
        i = self.userIndex[userId]
        #average the friend scores over the user's whole row;
        #the original code summed with axis=0, which on the 1 x nusers row
        #is a no-op, so [0, 0] picked out only the first friend's score
        return (self.userFriends[i, :].sum(axis=1) / nusers)[0, 0]
    def eventPop(self, eventId):
        """
        The event's own popularity, measured by the number of attendees
        """
i = self.eventIndex[eventId]
return self.eventPopularity[i, 0]
    def rewriteData(self, start=1, train=True, header=True):
        """
        Combine the user-based CF score, the item-based CF scores, and the
        popularity/influence measures above into one feature set,
        and write a new train/test file for the classifier
        """
fn = 'train.csv' if train else 'test.csv'
fin = open(fn)
fout = open('data_' + fn, 'w')
#write output header
if header:
ocolnames = ['invited', 'user_reco', 'evt_p_reco', 'evt_c_reco', 'user_pop', 'frnd_infl', 'evt_pop']
if train:
ocolnames.append('interested')
ocolnames.append('not_interested')
fout.write( ','.join(ocolnames) + '\n' )
ln = 0
for line in fin:
ln += 1
if ln < start:
continue
cols = line.strip().split(',')
#user,event,invited,timestamp,interested,not_interested
userId = cols[0]
eventId = cols[1]
invited = cols[2]
if ln % 500 == 0:
print("%s : %d (userId, eventId) = (%s, %s)" % (fn, ln, userId, eventId))
user_reco = self.userReco( userId, eventId )
evt_p_reco, evt_c_reco = self.eventReco( userId, eventId )
user_pop = self.userPop( userId )
frnd_infl = self.friendInfluence( userId )
evt_pop = self.eventPop( eventId )
ocols = [invited, user_reco, evt_p_reco, evt_c_reco, user_pop, frnd_infl, evt_pop]
if train:
ocols.append( cols[4] )#interested
ocols.append( cols[5] )#not_interested
            fout.write(','.join( map(str, ocols) ) + '\n')
fin.close()
fout.close()
    def rewriteTrainingSet(self):
        self.rewriteData(train=True)#pass train as a keyword; a bare positional True/False would bind to start
    def rewriteTestSet(self):
        self.rewriteData(train=False)
dr = DataRewriter()
print('Generating training data...\n')
dr.rewriteData(train=True, start=2, header=True)
print('Generating test data...\n')
dr.rewriteData(train=False, start=2, header=True)
print('done')
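A quick look at the generated feature file can confirm the columns match ocolnames (a sketch, assuming data_train.csv was written by the run above):
import pandas as pd
df_feat = pd.read_csv('data_train.csv')
print( df_feat.columns.tolist() )#['invited', 'user_reco', 'evt_p_reco', 'evt_c_reco', 'user_pop', 'frnd_infl', 'evt_pop', 'interested', 'not_interested']
print( df_feat.shape )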
Step 7: model building and prediction
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')
def train():
    """
    Train a classifier on the features we generated;
    the target is 1 (interested) or 0 (not interested)
    """
    trainDf = pd.read_csv('data_train.csv')
    X = np.matrix( pd.DataFrame(trainDf, index=None, columns=['invited', 'user_reco', 'evt_p_reco',
                 'evt_c_reco','user_pop', 'frnd_infl', 'evt_pop']) )
    y = np.array(trainDf.interested)
    clf = SGDClassifier(loss='log', penalty='l2')#logistic regression trained with SGD
    clf.fit(X, y)
    return clf
def validate():
    """
    10-fold cross-validation; prints per-fold accuracy and the average
    """
    trainDf = pd.read_csv('data_train.csv')
    X = np.matrix(pd.DataFrame(trainDf, index=None, columns=['invited', 'user_reco', 'evt_p_reco',
                 'evt_c_reco','user_pop', 'frnd_infl', 'evt_pop']) )
    y = np.array(trainDf.interested)
    kfold = KFold(n_splits=10, shuffle=False)
    avgAccuracy = 0
    run = 0
    for train_idx, test_idx in kfold.split(X, y):#renamed from train/test to avoid shadowing the functions above
        Xtrain, Xtest, ytrain, ytest = X[train_idx], X[test_idx], y[train_idx], y[test_idx]
        clf = SGDClassifier(loss='log', penalty='l2')
        clf.fit(Xtrain, ytrain)
        accuracy = 0
        ntest = len(ytest)
        for i in range(0, ntest):
            yt = clf.predict(Xtest[i, :])
            if yt == ytest[i]:
                accuracy += 1
        accuracy = accuracy / ntest
        avgAccuracy += accuracy
        run += 1
        print('accuracy(run %d) : %f' % (run, accuracy) )
    print('average accuracy : %f' % (avgAccuracy / run) )
def test(clf):
    """
    Read the test data and predict with the trained classifier
    """
origTestDf = pd.read_csv("test.csv")
users = origTestDf.user
events = origTestDf.event
testDf = pd.read_csv("data_test.csv")
fout = open("result.csv", 'w')
fout.write(",".join(["user", "event", "outcome", "dist"]) + "\n")
nrows = len(testDf)
Xp = np.matrix(testDf)
yp = np.zeros((nrows, 2))
for i in range(0, nrows):
xp = Xp[i, :]
yp[i, 0] = clf.predict(xp)
yp[i, 1] = clf.decision_function(xp)
fout.write(",".join( map( lambda x: str(x), [users[i], events[i], yp[i, 0], yp[i, 1]] ) ) + "\n")
fout.close()
clf = train()
validate()
test(clf)
print('done')
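Since the classifier is trained with loss='log' (logistic regression), it also exposes predict_proba, so a probability can be reported instead of the raw decision_function margin. A hedged sketch of such a variant (predict_with_proba is a hypothetical helper, not part of the original pipeline):
def predict_with_proba(clf, Xp):
    """Return predicted labels and P(interested) for each test row."""
    labels = clf.predict(Xp)
    probs = clf.predict_proba(Xp)[:, 1]#probability of class 1 (interested)
    return labels, probs
#usage sketch: labels, probs = predict_with_proba(clf, np.matrix(pd.read_csv('data_test.csv')))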