Step 1: Collect user and event information
# Inspect the data in train.csv
import pandas as pd
df_train = pd.read_csv('train.csv')
df_train.head()
 | user | event | invited | timestamp | interested | not_interested
---|---|---|---|---|---|---
0 | 3044012 | 1918771225 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
1 | 3044012 | 1502284248 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
2 | 3044012 | 2529072432 | 0 | 2012-10-02 15:53:05.754000+00:00 | 1 | 0 |
3 | 3044012 | 3072478280 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
4 | 3044012 | 1390707377 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15398 entries, 0 to 15397
Data columns (total 6 columns):
user 15398 non-null int64
event 15398 non-null int64
invited 15398 non-null int64
timestamp 15398 non-null object
interested 15398 non-null int64
not_interested 15398 non-null int64
dtypes: int64(5), object(1)
memory usage: 721.9+ KB
# Inspect the data in test.csv
df_test = pd.read_csv('test.csv')
df_test.head()
 | user | event | invited | timestamp
---|---|---|---|---
0 | 1776192 | 2877501688 | 0 | 2012-11-30 11:39:01.230000+00:00 |
1 | 1776192 | 3025444328 | 0 | 2012-11-30 11:39:01.230000+00:00 |
2 | 1776192 | 4078218285 | 0 | 2012-11-30 11:39:01.230000+00:00 |
3 | 1776192 | 1024025121 | 0 | 2012-11-30 11:39:01.230000+00:00 |
4 | 1776192 | 2972428928 | 0 | 2012-11-30 11:39:21.985000+00:00 |
df_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10237 entries, 0 to 10236
Data columns (total 4 columns):
user 10237 non-null int64
event 10237 non-null int64
invited 10237 non-null int64
timestamp 10237 non-null object
dtypes: int64(3), object(1)
memory usage: 320.0+ KB
- The first two columns are the user ID and the corresponding event ID
- test.csv, however, lacks the labels (interested or not_interested)
#Full code for step 1
from collections import defaultdict
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
#Analyzes the associations between users and events in train and test.
class ProgramEntities:
"""
我们只关心train和test中出现的user和event,因此重点处理这部分关联数据,
经过统计:train和test中总共3391个users和13418个events
"""
def __init__(self):
#统计训练集中有多少独立的用户的events
uniqueUsers = set()#uniqueUsers保存总共多少个用户:3391个
uniqueEvents = set()#uniqueEvents保存总共多少个events:13418个
eventsForUser = defaultdict(set)#字典eventsForUser保存了每个user:所对应的event
usersForEvent = defaultdict(set)#字典usersForEvent保存了每个event:哪些user点击
for filename in ['train.csv', 'test.csv']:
f = open(filename)
f.readline()#skip the header line
for line in f:
cols = line.strip().split(',')
uniqueUsers.add( cols[0] ) #collect all users
uniqueEvents.add( cols[1] ) #collect all events
eventsForUser[cols[0]].add( cols[1] ) #keyed by user, record the events each user touched
usersForEvent[cols[1]].add( cols[0] ) #keyed by event, record the users each event touched
f.close()
self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
self.userIndex = dict()
self.eventIndex = dict()
for i, u in enumerate(uniqueUsers):
self.userIndex[u] = i
for i, e in enumerate(uniqueEvents):
self.eventIndex[e] = i
ftrain = open('train.csv')
ftrain.readline()
for line in ftrain:
cols = line.strip().split(',')
i = self.userIndex[ cols[0] ]
j = self.eventIndex[ cols[1] ]
self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
ftrain.close()
sio.mmwrite('PE_userEventScores', self.userEventScores)
#to avoid unnecessary computation, we collect only the associated user pairs and event pairs
#an associated user pair is two users who acted on at least one common event
#an associated event pair is two events that at least one common user acted on
self.uniqueUserPairs = set()
self.uniqueEventPairs = set()
#find associated user pairs
for event in uniqueEvents:
users = usersForEvent[event]
if len(users) > 2:
self.uniqueUserPairs.update( itertools.combinations(users, 2) )
#find associated event pairs
for user in uniqueUsers:
events = eventsForUser[user]
if len(events) > 2:
self.uniqueEventPairs.update( itertools.combinations(events, 2) )
#print(self.userIndex)
cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
print('Step 1: collecting user and event information...')
pe = ProgramEntities()
print('Step 1 complete...\n')
Step 1: collecting user and event information...
Step 1 complete...
pe.userEventScores
<3391x13418 sparse matrix of type '<class 'numpy.float64'>'
with 4645 stored elements in Dictionary Of Keys format>
Notes:
- PE_userEventScores.mtx is the matrix over all users and events, but its stored values come only from train.csv and are 1 or -1
- scipy.sparse.dok_matrix() creates a sparse matrix, so PE_userEventScores.mtx only stores the non-zero values
- A quick reference for the variables used in this step:
- uniqueUsers: set of all user IDs in train.csv and test.csv
- uniqueEvents: set of all event IDs in train.csv and test.csv
- eventsForUser: dict whose keys are users and whose values are each user's set of events
- usersForEvent: dict whose keys are events and whose values are each event's set of users
- userIndex: dict assigning each user an index
- eventIndex: dict assigning each event an index
- userEventScores: 3391 * 13418 sparse matrix, user vs event, whose entries are each user's interest score for an event in train.csv (1, 0 or -1), i.e. interested - not_interested
import pandas as pd
pd.DataFrame(pe.userEventScores.todense())
userEventScores: each user's interest score (1, 0 or -1) for each event
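To sanity-check the saved artifacts, here is a minimal sketch (assuming the PE_* files written above are in the working directory) that reloads them and looks up one pair from train.csv:
#reload the step-1 artifacts
import _pickle as cPickle
import scipy.io as sio
userEventScores = sio.mmread('PE_userEventScores').todok()#back to a sparse matrix
userIndex = cPickle.load( open('PE_userIndex.pkl', 'rb') )
eventIndex = cPickle.load( open('PE_eventIndex.pkl', 'rb') )
#user 3044012 marked interested in event 2529072432 in train.csv, so the stored score should be 1.0
i, j = userIndex['3044012'], eventIndex['2529072432']
print(userEventScores[i, j])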
import pandas as pd
df_train = pd.read_csv('train.csv')
df_train[df_train['event']==1502284248]
import itertools
for each in itertools.combinations(set([3044012,1302145719,3194014105,3669515588]), 2):
print(each)
(3194014105, 3669515588)
(3194014105, 3044012)
(3194014105, 1302145719)
(3669515588, 3044012)
(3669515588, 1302145719)
(3044012, 1302145719)
uniqueUserPairs: a set. If an event is associated with three or more users, all of that event's users are paired up pairwise and stored in uniqueUserPairs. Note that the raw user IDs are stored, not the users' indices:
import pandas as pd
df_train = pd.read_csv('train.csv')
df_train[df_train['user']==3044012]
 | user | event | invited | timestamp | interested | not_interested
---|---|---|---|---|---|---
0 | 3044012 | 1918771225 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
1 | 3044012 | 1502284248 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
2 | 3044012 | 2529072432 | 0 | 2012-10-02 15:53:05.754000+00:00 | 1 | 0 |
3 | 3044012 | 3072478280 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
4 | 3044012 | 1390707377 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
5 | 3044012 | 1532377761 | 0 | 2012-10-02 15:53:05.754000+00:00 | 0 | 0 |
import itertools
for each in itertools.combinations(set([1918771225,1502284248,2529072432, 3072478280, 1390707377, 1532377761 ]), 2):
print(each)
(1532377761, 3072478280)
(1532377761, 2529072432)
(1532377761, 1390707377)
(1532377761, 1502284248)
(1532377761, 1918771225)
(3072478280, 2529072432)
(3072478280, 1390707377)
(3072478280, 1502284248)
(3072478280, 1918771225)
(2529072432, 1390707377)
(2529072432, 1502284248)
(2529072432, 1918771225)
(1390707377, 1502284248)
(1390707377, 1918771225)
(1502284248, 1918771225)
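uniqueEventPairs: a set; likewise, if the same user is associated with three or more events, those events are paired up pairwise and stored in uniqueEventPairs. As with uniqueUserPairs, raw event IDs are stored, not matrix indices; the combinations above are exactly the pairs generated for user 3044012.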
Step 2: Compute user similarity
Since this step uses users.csv, let's first look at its contents (first 10 rows):
import pandas as pd
df_users = pd.read_csv('users.csv')
df_users.head(10)
 | user_id | locale | birthyear | gender | joinedAt | location | timezone
---|---|---|---|---|---|---|---
0 | 3197468391 | id_ID | 1993 | male | 2012-10-02T06:40:55.524Z | Medan Indonesia | 480.0 |
1 | 3537982273 | id_ID | 1992 | male | 2012-09-29T18:03:12.111Z | Medan Indonesia | 420.0 |
2 | 823183725 | en_US | 1975 | male | 2012-10-06T03:14:07.149Z | Stratford Ontario | -240.0 |
3 | 1872223848 | en_US | 1991 | female | 2012-11-04T08:59:43.783Z | Tehran Iran | 210.0 |
4 | 3429017717 | id_ID | 1995 | female | 2012-09-10T16:06:53.132Z | NaN | 420.0 |
5 | 627175141 | ka_GE | 1973 | female | 2012-11-01T09:59:17.590Z | Tbilisi Georgia | 240.0 |
6 | 2752000443 | id_ID | 1994 | male | 2012-10-03T05:22:17.637Z | Medan Indonesia | 420.0 |
7 | 3473687777 | id_ID | 1965 | female | 2012-10-03T12:19:29.975Z | Medan Indonesia | 420.0 |
8 | 2966052962 | id_ID | 1979 | male | 2012-10-31T10:11:57.668Z | Medan Indonesia | 420.0 |
9 | 264876277 | id_ID | 1988 | female | 2012-10-02T07:28:09.555Z | Medan Indonesia | 420.0 |
#use the locale and pycountry modules to convert these strings to numeric values
import locale
locale.locale_alias
{'a3': 'az_AZ.KOI8-C',
'a3_az': 'az_AZ.KOI8-C',
'a3_az.koic': 'az_AZ.KOI8-C',
'aa_dj': 'aa_DJ.ISO8859-1',
'aa_er': 'aa_ER.UTF-8',
...
'zh_tw': 'zh_TW.big5',
'zh_tw.euc': 'zh_TW.eucTW',
'zh_tw.euctw': 'zh_TW.eucTW',
'zu': 'zu_ZA.ISO8859-1',
'zu_za': 'zu_ZA.ISO8859-1'}
(output truncated: locale.locale_alias maps several hundred alias strings to canonical locale names)
1. Processing the locale column
import locale
from collections import defaultdict
localeIdMap = defaultdict(int)
for i, l in enumerate(locale.locale_alias.keys()):
localeIdMap[l] = i + 1
for each in localeIdMap:
print(each, '\t', localeIdMap[each])
ee 	 1
fr_ch 	 2
fo_fo 	 3
af_za 	 4
bn_in 	 5
...
te_in 	 549
sl 	 550
lo_la.mulelao1 	 551
(output truncated: one line per locale alias, 551 entries in total)
So passing a locale string to localeIdMap converts it to a numeric value. If the string is not among localeIdMap's keys, 0 is returned, which is exactly what defaultdict(int) is for:
print(localeIdMap['en_GB'.lower()])
print(localeIdMap['en_US'.lower()])
print(localeIdMap['id_ID'.lower()])
print(localeIdMap['ka_GE'.lower()])
366
404
110
385
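For a string that is not among the keys, the defaultdict falls back to 0 ('xx_XX' here is just a made-up locale for illustration):
print(localeIdMap['xx_XX'.lower()])#prints 0, since 'xx_xx' is not a known alias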
2. Processing the birthyear column
This one is simple: if a value exists, convert it to an int; otherwise fill with 0.
def getBirthYearInt(birthYear):
try:
return 0 if birthYear=="None" else int(birthYear)
except:
return 0
print(getBirthYearInt(1992))
print(getBirthYearInt(None))
1992
0
3. Processing the gender column
male maps to 1, female maps to 2, and missing values are filled with 0.
from collections import defaultdict
genderIdMap = defaultdict(int, {'male':1, 'female':2})
print(genderIdMap['male'])
print(genderIdMap['female'])
print(genderIdMap[None])
1
2
0
4. Processing the joinedAt column
We notice that this column's values share a common pattern:
import pandas as pd
df_users = pd.read_csv('users.csv')
df_users['joinedAt'][:10]
0 2012-10-02T06:40:55.524Z
1 2012-09-29T18:03:12.111Z
2 2012-10-06T03:14:07.149Z
3 2012-11-04T08:59:43.783Z
4 2012-09-10T16:06:53.132Z
5 2012-11-01T09:59:17.590Z
6 2012-10-03T05:22:17.637Z
7 2012-10-03T12:19:29.975Z
8 2012-10-31T10:11:57.668Z
9 2012-10-02T07:28:09.555Z
Name: joinedAt, dtype: object
The column is either None or a timestamp string like the above, always with a T in the middle and a Z at the end. Based on this pattern we can extract the time information with the datetime module:
import datetime
def getJoinedYearMonth(dateString):
try:
dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
return "".join( [str(dttm.year), str(dttm.month)] )
except:
return 0
df_users['joinedAt'].map(getJoinedYearMonth)[:10]
0 201210
1 20129
2 201210
3 201211
4 20129
5 201211
6 201210
7 201210
8 201210
9 201210
Name: joinedAt, dtype: object
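Note that str(dttm.month) is not zero-padded, which is why September 2012 comes out as '20129' rather than '201209'. A hedged alternative sketch (not the original code) that keeps the value fixed-width by using strftime:
import datetime

def getJoinedYearMonthPadded(dateString):
    try:
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return dttm.strftime('%Y%m')  #e.g. '201209' instead of '20129'
    except (TypeError, ValueError):
        return 0

print(getJoinedYearMonthPadded('2012-09-29T18:03:12.111Z'))#201209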
5. Processing the location column
Let's look at the location column in users.csv (first 20 rows):
df_users['location'][:20]
0 Medan Indonesia
1 Medan Indonesia
2 Stratford Ontario
3 Tehran Iran
4 NaN
5 Tbilisi Georgia
6 Medan Indonesia
7 Medan Indonesia
8 Medan Indonesia
9 Medan Indonesia
10 Medan Indonesia
11 Phnom Penh
12 Djokja Yogyakarta Indonesia
13 Triolet Mauritius
14 NaN
15 NaN
16 NaN
17 Surabaya Indonesia
18 Medan Indonesia
19 NaN
Name: location, dtype: object
We use the pycountry module to convert this column to numeric values; pycountry.countries is an iterable:
import pycountry
from collections import defaultdict
countryIdMap = defaultdict(int)
for i, c in enumerate(pycountry.countries):
countryIdMap[c.name.lower()] = i + 1
#convert the location string to a numeric value
def getCountryId(location):
if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:
#take the substring after the last space as the country name;
#'+ 1' starts right after the last space (with '+ 2' the country's first letter would be skipped and every lookup would miss)
return countryIdMap[ location[location.rindex(' ') + 1: ].lower() ]
else:
return 0
print(getCountryId('San Dimas California'))
print(getCountryId('Jogjakarta Indonesia'))
0
103
Many machine learning models only accept numeric input, so the location strings must be converted to numbers. A common approach is one-hot encoding, but that would produce a very sparse matrix; instead we use the pycountry library to encode each location according to the country (and subdivision) entries stored in pycountry, and use that code in place of the raw location string.
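A quick peek at what pycountry provides (a sketch; the exact counts and IDs depend on your installed pycountry version, so treat the numbers as illustrative):
import pycountry

print(len(list(pycountry.countries)))              #about 250 ISO 3166-1 entries
print(pycountry.countries.get(alpha_2='ID').name)  #'Indonesia'
print(countryIdMap['indonesia'])                   #103 in the run above
print(countryIdMap['california'])                  #0 here -- US/CA subdivisions are only added inside DataCleaner below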
6. Processing the timezone column
Simple again: convert existing values to int, fill missing values with 0.
def getTimezoneInt(timezone):
try:
return int(timezone)
except:
return 0
print(getTimezoneInt(-240))#-240
print(getTimezoneInt(240))
print(getTimezoneInt(None))
-240
240
0
7. Normalize the columns processed in 1-6 above
Building self.userMatrix ends with sklearn.preprocessing.normalize(); normalization makes it convenient to compute the similarity between two users afterwards.
Similarity is only computed for the uniqueUserPairs found in step 1 of this walkthrough, i.e. user pairs linked by having acted on the same event.
The computation uses scipy.spatial.distance.correlation(u, v), which returns the correlation distance between u and v, i.e. 1 minus the Pearson correlation coefficient (centered cosine); see the sketch below.
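A toy sketch of these two building blocks (made-up vectors, not the real user rows): l1 normalization makes every column sum to 1, and ssd.correlation returns a distance, so perfectly correlated vectors give 0:
import numpy as np
import scipy.spatial.distance as ssd
from sklearn.preprocessing import normalize

X = np.array([[1.0, 200.0],
              [3.0, 600.0]])
print(normalize(X, norm='l1', axis=0))#[[0.25 0.25], [0.75 0.75]] -- each column sums to 1

u = np.array([1.0, 2.0, 3.0])
v = np.array([2.0, 4.0, 6.0])
print(ssd.correlation(u, v))#~0.0 -- perfectly correlated vectors have correlation distance 0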
#Full code for step 2
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
import scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalize
#class building the user-event matrix
class ProgramEntities:
"""
我们只关心train和test中出现的user和event,因此重点处理这部分关联数据,
经过统计:train和test中总共3391个users和13418个events
"""
def __init__(self):
#统计训练集中有多少独立的用户的events
uniqueUsers = set()#uniqueUsers保存总共多少个用户:3391个
uniqueEvents = set()#uniqueEvents保存总共多少个events:13418个
eventsForUser = defaultdict(set)#字典eventsForUser保存了每个user:所对应的event
usersForEvent = defaultdict(set)#字典usersForEvent保存了每个event:哪些user点击
for filename in ['train.csv', 'test.csv']:
f = open(filename)
f.readline()#skip the header line
for line in f:
cols = line.strip().split(',')
uniqueUsers.add( cols[0] )
uniqueEvents.add( cols[1] )
eventsForUser[cols[0]].add( cols[1] )
usersForEvent[cols[1]].add( cols[0] )
f.close()
self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
self.userIndex = dict()
self.eventIndex = dict()
for i, u in enumerate(uniqueUsers):
self.userIndex[u] = i
for i, e in enumerate(uniqueEvents):
self.eventIndex[e] = i
ftrain = open('train.csv')
ftrain.readline()
for line in ftrain:
cols = line.strip().split(',')
i = self.userIndex[ cols[0] ]
j = self.eventIndex[ cols[1] ]
self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
ftrain.close()
sio.mmwrite('PE_userEventScores', self.userEventScores)
#to avoid unnecessary computation, we collect only the associated user pairs and event pairs
#an associated user pair is two users who acted on at least one common event
#an associated event pair is two events that at least one common user acted on
self.uniqueUserPairs = set()
self.uniqueEventPairs = set()
for event in uniqueEvents:
users = usersForEvent[event]
if len(users) > 2:
self.uniqueUserPairs.update( itertools.combinations(users, 2) )
for user in uniqueUsers:
events = eventsForUser[user]
if len(events) > 2:
self.uniqueEventPairs.update( itertools.combinations(events, 2) )
#print(self.userIndex)
cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
#data cleaning class
class DataCleaner:
def __init__(self):
#helpers converting strings to numeric values
#load locale aliases
self.localeIdMap = defaultdict(int)
for i, l in enumerate(locale.locale_alias.keys()):
self.localeIdMap[l] = i + 1
#load countries
self.countryIdMap = defaultdict(int)
ctryIdx = defaultdict(int)
for i, c in enumerate(pycountry.countries):
self.countryIdMap[c.name.lower()] = i + 1
if c.name.lower() == 'usa':
ctryIdx['US'] = i
if c.name.lower() == 'canada':
ctryIdx['CA'] = i
for cc in ctryIdx.keys():
for s in pycountry.subdivisions.get(country_code=cc):
self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
#gender mapping
self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
#locale id
def getLocaleId(self, locstr):
#localeIdMap is a defaultdict(int), so an unknown locstr.lower() returns the default int 0
return self.localeIdMap[ locstr.lower() ]
#birthyear
def getBirthYearInt(self, birthYear):
try:
return 0 if birthYear == 'None' else int(birthYear)
except:
return 0
#gender
def getGenderId(self, genderStr):
return self.genderIdMap[genderStr]
#joinedAt
def getJoinedYearMonth(self, dateString):
dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
return "".join( [str(dttm.year), str(dttm.month) ] )
#location: take the word after the last space as the country name
def getCountryId(self, location):
if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:
return self.countryIdMap[ location[location.rindex(' ') + 1: ].lower() ]#'+ 1' starts right after the last space
else:
return 0
#timezone
def getTimezoneInt(self, timezone):
try:
return int(timezone)
except:
return 0
#class building the user-user similarity matrix
class Users:
"""
Build the user/user similarity matrix
"""
def __init__(self, programEntities, sim=ssd.correlation):#ssd.correlation(u, v) computes the correlation distance between u and v (1 - Pearson correlation)
cleaner = DataCleaner()
nusers = len(programEntities.userIndex.keys())#3391
#print(nusers)
fin = open('users.csv')
colnames = fin.readline().strip().split(',') #7 feature columns
self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )#build the sparse matrix
for line in fin:
cols = line.strip().split(',')
#"only consider users appearing in train.csv" -- this comment comes from the original author,
#but userIndex actually contains all users from both train and test, so it is not quite accurate
#build the user matrix from the cleaned raw data
if cols[0] in programEntities.userIndex:
i = programEntities.userIndex[ cols[0] ]#index of this user
self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )#locale
self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )#birthyear, missing values filled with 0
self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )#gender
self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )#joinedAt column
self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )#location
self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )#timezone
fin.close()
#normalize the matrix
self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
sio.mmwrite('US_userMatrix', self.userMatrix)
#compute the user similarity matrix, used later
self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )#(3391,3391)
for i in range(0, nusers):
self.userSimMatrix[i, i] = 1.0
for u1, u2 in programEntities.uniqueUserPairs:
i = programEntities.userIndex[u1] #index of user u1
j = programEntities.userIndex[u2]
if (i, j) not in self.userSimMatrix:
#print(self.userMatrix.getrow(i).todense()) e.g. [[0.00028123,0.00029847,0.00043592,0.00035208,0,0.00032346]]
#print(self.userMatrix.getrow(j).todense()) e.g. [[0.00028123,0.00029742,0.00043592,0.00035208,0,-0.00032346]]
usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense()) #correlation distance between the two user vectors; the matrix is symmetric
self.userSimMatrix[i, j] = usim
self.userSimMatrix[j, i] = usim
sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
print('Step 1: collecting user and event information...')
pe = ProgramEntities()
print('Step 1 complete...\n')
print('Step 2: computing user similarity and storing it as a matrix...')
Users(pe)
print('Step 2 complete...\n')
Step 1: collecting user and event information...
Step 1 complete...
Step 2: computing user similarity and storing it as a matrix...
Step 2 complete...
Step 3: Process user social relationship information
This step needs the user_friends.csv.gz file; let's look at its contents first:
import pandas as pd
df_user_friends = pd.read_csv('user_friends.csv.gz', compression='gzip')
df_user_friends.head()
 | user | friends
---|---|---
0 | 3197468391 | 1346449342 3873244116 4226080662 1222907620 54... |
1 | 3537982273 | 1491560444 395798035 2036380346 899375619 3534... |
2 | 823183725 | 1484954627 1950387873 1652977611 4185960823 42... |
3 | 1872223848 | 83361640 723814682 557944478 1724049724 253059... |
4 | 3429017717 | 4253303705 2130310957 1838389374 3928735761 71... |
- 1) If you have more friends, you may be more extroverted and more likely to attend all kinds of events
- 2) If your friends attend an event, you may well tag along; the sketch below illustrates the friend-influence score
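A toy sketch of the friend-influence score used in UserFriends below (made-up numbers): a friend's row of userEventScores is averaged over all events, so +1 "interested" answers pull the score up and -1 "not interested" answers pull it down:
import numpy as np
import scipy.sparse as ss

nevents = 10
row = ss.dok_matrix((1, nevents))#one friend's reactions to 10 toy events
row[0, 1] = 1  #interested in event 1
row[0, 4] = -1 #not interested in event 4
row[0, 7] = 1  #interested in event 7

eventsForUser = row.todense()
score = eventsForUser.sum() / np.shape(eventsForUser)[1]
print(score)#0.1 = (1 - 1 + 1) / 10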
# Full code for step 3
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
import scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalize
import gzip
import numpy as np
#process the user-event association data
class ProgramEntities:
"""
我们只关心train和test中出现的user和event,因此重点处理这部分关联数据,
经过统计:train和test中总共3391个users和13418个events
"""
def __init__(self):
#统计训练集中有多少独立的用户的events
uniqueUsers = set()#uniqueUsers保存总共多少个用户:3391个
uniqueEvents = set()#uniqueEvents保存总共多少个events:13418个
eventsForUser = defaultdict(set)#字典eventsForUser保存了每个user:所对应的event
usersForEvent = defaultdict(set)#字典usersForEvent保存了每个event:哪些user点击
for filename in ['train.csv', 'test.csv']:
f = open(filename)
f.readline()#skip the header line
for line in f:
cols = line.strip().split(',')
uniqueUsers.add( cols[0] )
uniqueEvents.add( cols[1] )
eventsForUser[cols[0]].add( cols[1] )
usersForEvent[cols[1]].add( cols[0] )
f.close()
self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
self.userIndex = dict()
self.eventIndex = dict()
for i, u in enumerate(uniqueUsers):
self.userIndex[u] = i
for i, e in enumerate(uniqueEvents):
self.eventIndex[e] = i
ftrain = open('train.csv')
ftrain.readline()
for line in ftrain:
cols = line.strip().split(',')
i = self.userIndex[ cols[0] ]
j = self.eventIndex[ cols[1] ]
self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
ftrain.close()
sio.mmwrite('PE_userEventScores', self.userEventScores)
#to avoid unnecessary computation, we collect only the associated user pairs and event pairs
#an associated user pair is two users who acted on at least one common event
#an associated event pair is two events that at least one common user acted on
self.uniqueUserPairs = set()
self.uniqueEventPairs = set()
for event in uniqueEvents:
users = usersForEvent[event]
if len(users) > 2:
self.uniqueUserPairs.update( itertools.combinations(users, 2) )
for user in uniqueUsers:
events = eventsForUser[user]
if len(events) > 2:
self.uniqueEventPairs.update( itertools.combinations(events, 2) )
#print(self.userIndex)
cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
#data cleaning class
class DataCleaner:
def __init__(self):
#helpers converting strings to numeric values
#load locale aliases
self.localeIdMap = defaultdict(int)
for i, l in enumerate(locale.locale_alias.keys()):
self.localeIdMap[l] = i + 1
#load countries
self.countryIdMap = defaultdict(int)
ctryIdx = defaultdict(int)
for i, c in enumerate(pycountry.countries):
self.countryIdMap[c.name.lower()] = i + 1
if c.name.lower() == 'usa':
ctryIdx['US'] = i
if c.name.lower() == 'canada':
ctryIdx['CA'] = i
for cc in ctryIdx.keys():
for s in pycountry.subdivisions.get(country_code=cc):
self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
#locale id
def getLocaleId(self, locstr):
#localeIdMap is a defaultdict(int), so an unknown locstr.lower() returns the default int 0
return self.localeIdMap[ locstr.lower() ]
#birthyear
def getBirthYearInt(self, birthYear):
try:
return 0 if birthYear == 'None' else int(birthYear)
except:
return 0
#gender
def getGenderId(self, genderStr):
return self.genderIdMap[genderStr]
#joinedAt
def getJoinedYearMonth(self, dateString):
dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
return "".join( [str(dttm.year), str(dttm.month) ] )
#location: take the word after the last space as the country name
def getCountryId(self, location):
if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:
return self.countryIdMap[ location[location.rindex(' ') + 1: ].lower() ]#'+ 1' starts right after the last space
else:
return 0
#timezone
def getTimezoneInt(self, timezone):
try:
return int(timezone)
except:
return 0
#user-user similarity matrix
class Users:
"""
Build the user/user similarity matrix
"""
def __init__(self, programEntities, sim=ssd.correlation):#ssd.correlation(u, v) computes the correlation distance between u and v (1 - Pearson correlation)
cleaner = DataCleaner()
nusers = len(programEntities.userIndex.keys())#3391
#print(nusers)
fin = open('users.csv')
colnames = fin.readline().strip().split(',') #7 feature columns
self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )#build the sparse matrix
for line in fin:
cols = line.strip().split(',')
#"only consider users appearing in train.csv" -- this comment comes from the original author,
#but userIndex actually contains all users from both train and test, so it is not quite accurate
if cols[0] in programEntities.userIndex:
i = programEntities.userIndex[ cols[0] ]#index of this user
self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )#locale
self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )#birthyear, missing values filled with 0
self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )#gender
self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )#joinedAt column
self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )#location
self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )#timezone
fin.close()
#normalize the matrix
self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
sio.mmwrite('US_userMatrix', self.userMatrix)
#compute the user similarity matrix, used later
self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )#(3391,3391)
for i in range(0, nusers):
self.userSimMatrix[i, i] = 1.0
for u1, u2 in programEntities.uniqueUserPairs:
i = programEntities.userIndex[u1]
j = programEntities.userIndex[u2]
if (i, j) not in self.userSimMatrix:
#print(self.userMatrix.getrow(i).todense()) e.g. [[0.00028123,0.00029847,0.00043592,0.00035208,0,0.00032346]]
#print(self.userMatrix.getrow(j).todense()) e.g. [[0.00028123,0.00029742,0.00043592,0.00035208,0,-0.00032346]]
usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())
self.userSimMatrix[i, j] = usim
self.userSimMatrix[j, i] = usim
sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
#mining user social relations
class UserFriends:
"""
Find each user's friends; the idea is simple:
1) if you have more friends, you may be extroverted and more likely to attend events
2) if your friends attend an event, you may well attend it too
"""
def __init__(self, programEntities):
nusers = len(programEntities.userIndex.keys())#3391 users
self.numFriends = np.zeros( (nusers) )#array([0., 0., 0., ..., 0., 0., 0.]), number of friends per user
self.userFriends = ss.dok_matrix( (nusers, nusers) ) #accumulated event-score influence between each user and their friends
fin = gzip.open('user_friends.csv.gz')
print( 'Header In User_friends.csv.gz:',fin.readline() )
ln = 0
#read user_friends.csv.gz line by line
#a first-column user matters to us only if it appears in userIndex
#for such a user, fetch its index and its number of friends
#for each of the user's friends that also appears in userIndex, fetch the friend's index,
#then look up that friend's reactions to every event in userEventScores
#score is the friend's average score across all events
#the userFriends matrix records these user-friend scores
#e.g. user 851286067 (index 1750) appears in test.csv and has 2151 friends in user_friends.csv.gz,
#so its share of friends is 2151 / sumNumFriends = 2151 / 3731377.0 = 0.0005764627910822198
for line in fin:
if ln % 200 == 0:
print( 'Loading line:', ln )
cols = line.decode().strip().split(',')
user = cols[0]
if user in programEntities.userIndex:
friends = cols[1].split(' ')#the user's list of friends
i = programEntities.userIndex[user]
self.numFriends[i] = len(friends)
for friend in friends:
if friend in programEntities.userIndex:
j = programEntities.userIndex[friend]
#the objective of this score is to infer the degree to
#and direction in which this friend will influence the
#user's decision, so we sum the user/event score for
#this user across all training events
eventsForUser = programEntities.userEventScores.getrow(j).todense()#the friend's reaction to each event: 0, 1, or -1
#print(eventsForUser.sum(), np.shape(eventsForUser)[1] )
#score is the friend's average over all 13418 events
score = eventsForUser.sum() / np.shape(eventsForUser)[1]#np.shape(eventsForUser)[1] = 13418
#print(score)
self.userFriends[i, j] += score
self.userFriends[j, i] += score
ln += 1
fin.close()
#normalize the arrays
sumNumFriends = self.numFriends.sum(axis=0)#total number of friends across all users
print(sumNumFriends)
self.numFriends = self.numFriends / sumNumFriends#each user's share of the total friend count
sio.mmwrite('UF_numFriends', np.matrix(self.numFriends) ) #save the user-friend-count matrix
self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
sio.mmwrite('UF_userFriends', self.userFriends) #save the user-friend event-score matrix
print('Step 1: collecting user and event information...')
pe = ProgramEntities()
print('Step 1 complete...\n')
print('Step 2: computing user similarity and storing it as a matrix...')
#Users(pe)
print('Step 2 complete...\n')
print('Step 3: computing user social relationship information and storing it...')
UserFriends(pe)
print('Step 3 complete...\n')
Step 1: collecting user and event information...
Step 1 complete...
Step 2: computing user similarity and storing it as a matrix...
Step 2 complete...
Step 3: computing user social relationship information and storing it...
Header In User_friends.csv.gz: b'user,friends\n'
Loading line: 0
Loading line: 200
Loading line: 400
...
Loading line: 38000
Loading line: 38200
(output truncated: one "Loading line" message every 200 rows)
3731377.0
Step 3 complete...
Step 4: Build event-event similarity data
Let's first look at events.csv.gz:
import pandas as pd
df_events_csv = pd.read_csv('events.csv.gz', compression='gzip')
df_events_csv.head()
 | event_id | user_id | start_time | city | state | zip | country | lat | lng | c_1 | ... | c_92 | c_93 | c_94 | c_95 | c_96 | c_97 | c_98 | c_99 | c_100 | c_other
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 684921758 | 3647864012 | 2012-10-31T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 2 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 |
1 | 244999119 | 3476440521 | 2012-11-03T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 |
2 | 3928440935 | 517514445 | 2012-11-05T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 |
3 | 2582345152 | 781585781 | 2012-10-30T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 |
4 | 1051165850 | 1016098580 | 2012-09-27T00:00:00.001Z | NaN | NaN | NaN | NaN | NaN | NaN | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 |
5 rows × 110 columns
Converting the above information to numeric values:
1. The start_time column is processed with the datetime library, as sketched below
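No snippet for start_time is shown here, but presumably the same pattern as getJoinedYearMonth in step 2 applies; a minimal sketch under that assumption (getEventYearMonth is a hypothetical helper name):
import datetime

def getEventYearMonth(dateString):
    #assumption: start_time strings look like '2012-10-31T00:00:00.001Z'
    try:
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "".join([str(dttm.year), str(dttm.month)])
    except (TypeError, ValueError):
        return 0

print(getEventYearMonth('2012-10-31T00:00:00.001Z'))#201210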
2. The city, state, zip and country columns are all processed with the hashlib package; note that only events appearing in train.csv or test.csv go through this numeric conversion
import hashlib
def FeatureHash(value):
if len(value.strip()) == 0:
return -1
else:
return int( hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4] ,16)
print(FeatureHash('Muaraenim'))#47294
print(FeatureHash('a test demo'))#4030
47294
4030
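Taking the first 4 hex digits of the SHA-224 digest and parsing them base-16 maps any non-empty string deterministically to an integer in [0, 65535], while the empty string maps to -1. This is a simple form of feature hashing, so unrelated strings can occasionally collide.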
3. Processing the lat and lng columns
Missing values are filled with 0.0; everything else is converted to its float value
def getFloatValue(self, value):
if len(value.strip()) == 0:
return 0.0
else:
return float(value)
4. Processing the columns from c_1 onward (i.e. after the 10th column)
- A matrix eventContMatrix stores the c_1 through c_100 columns; the c_other column is not used
5. Normalize the eventPropMatrix and eventContMatrix matrices and save them to files
6. Use uniqueEventPairs to compute event-pair similarities
- This uses the correlation and cosine functions from scipy.spatial.distance (see the sketch below)
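A toy sketch of the two distances used for event-event similarity (made-up vectors, not the real event rows): ssd.correlation for the property rows and ssd.cosine for the c_1..c_100 count rows:
import numpy as np
import scipy.spatial.distance as ssd

prop_a = np.array([201210.0, 47294.0, 0.0, 3.5])#e.g. start_time, city hash, lat, lng
prop_b = np.array([201211.0, 47294.0, 0.0, 3.6])
print(ssd.correlation(prop_a, prop_b))#close to 0 -> very similar properties

cont_a = np.array([2.0, 0.0, 1.0, 0.0])#toy word counts
cont_b = np.array([4.0, 0.0, 2.0, 0.0])
print(ssd.cosine(cont_a, cont_b))#0.0 -> identical direction of word counts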
## Full code for step 4
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
import scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalize
import gzip
import numpy as np
import hashlib
#process the user-event association data
class ProgramEntities:
"""
我们只关心train和test中出现的user和event,因此重点处理这部分关联数据,
经过统计:train和test中总共3391个users和13418个events
"""
def __init__(self):
#统计训练集中有多少独立的用户的events
uniqueUsers = set()#uniqueUsers保存总共多少个用户:3391个
uniqueEvents = set()#uniqueEvents保存总共多少个events:13418个
eventsForUser = defaultdict(set)#字典eventsForUser保存了每个user:所对应的event
usersForEvent = defaultdict(set)#字典usersForEvent保存了每个event:哪些user点击
for filename in ['train.csv', 'test.csv']:
f = open(filename)
f.readline()#skip the header line
for line in f:
cols = line.strip().split(',')
uniqueUsers.add( cols[0] )
uniqueEvents.add( cols[1] )
eventsForUser[cols[0]].add( cols[1] )
usersForEvent[cols[1]].add( cols[0] )
f.close()
self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
self.userIndex = dict()
self.eventIndex = dict()
for i, u in enumerate(uniqueUsers):
self.userIndex[u] = i
for i, e in enumerate(uniqueEvents):
self.eventIndex[e] = i
ftrain = open('train.csv')
ftrain.readline()
for line in ftrain:
cols = line.strip().split(',')
i = self.userIndex[ cols[0] ]
j = self.eventIndex[ cols[1] ]
self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
ftrain.close()
sio.mmwrite('PE_userEventScores', self.userEventScores)
#to avoid unnecessary computation, we collect only the associated user pairs and event pairs
#an associated user pair is two users who acted on at least one common event
#an associated event pair is two events that at least one common user acted on
self.uniqueUserPairs = set()
self.uniqueEventPairs = set()
for event in uniqueEvents:
users = usersForEvent[event]
if len(users) > 2:
self.uniqueUserPairs.update( itertools.combinations(users, 2) )
for user in uniqueUsers:
events = eventsForUser[user]
if len(events) > 2:
self.uniqueEventPairs.update( itertools.combinations(events, 2) )
#print(self.userIndex)
cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
#data cleaning class
class DataCleaner:
def __init__(self):
#helpers converting strings to numeric values
#load locale aliases
self.localeIdMap = defaultdict(int)
for i, l in enumerate(locale.locale_alias.keys()):
self.localeIdMap[l] = i + 1
#load countries
self.countryIdMap = defaultdict(int)
ctryIdx = defaultdict(int)
for i, c in enumerate(pycountry.countries):
self.countryIdMap[c.name.lower()] = i + 1
if c.name.lower() == 'usa':
ctryIdx['US'] = i
if c.name.lower() == 'canada':
ctryIdx['CA'] = i
for cc in ctryIdx.keys():
for s in pycountry.subdivisions.get(country_code=cc):
self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
#locale id
def getLocaleId(self, locstr):
#localeIdMap is a defaultdict(int), so an unknown locstr.lower() returns the default int 0
return self.localeIdMap[ locstr.lower() ]
#birthyear
def getBirthYearInt(self, birthYear):
try:
return 0 if birthYear == 'None' else int(birthYear)
except:
return 0
#gender
def getGenderId(self, genderStr):
return self.genderIdMap[genderStr]
#joinedAt
def getJoinedYearMonth(self, dateString):
dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
return "".join( [str(dttm.year), str(dttm.month) ] )
#location: take the word after the last space as the country name
def getCountryId(self, location):
if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:
return self.countryIdMap[ location[location.rindex(' ') + 1: ].lower() ]#'+ 1' starts right after the last space
else:
return 0
#timezone
def getTimezoneInt(self, timezone):
try:
return int(timezone)
except:
return 0
def getFeatureHash(self, value):
if len(value.strip()) == 0:
return -1
else:
#return int( hashlib.sha224(value).hexdigest()[0:4], 16) raises in Python 3:
#TypeError: Unicode-objects must be encoded before hashing
return int( hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4], 16)#in Python 3 the string must be encoded first
def getFloatValue(self, value):
if len(value.strip()) == 0:
return 0.0
else:
return float(value)
#user-user similarity matrix
class Users:
"""
Build the user/user similarity matrix
"""
def __init__(self, programEntities, sim=ssd.correlation):#ssd.correlation(u, v) computes the correlation distance between u and v (1 - Pearson correlation)
cleaner = DataCleaner()
nusers = len(programEntities.userIndex.keys())#3391
#print(nusers)
fin = open('users.csv')
colnames = fin.readline().strip().split(',') #7 feature columns
self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )#build the sparse matrix
for line in fin:
cols = line.strip().split(',')
#"only consider users appearing in train.csv" -- this comment comes from the original author,
#but userIndex actually contains all users from both train and test, so it is not quite accurate
if cols[0] in programEntities.userIndex:
i = programEntities.userIndex[ cols[0] ]#获取user:对应的index
self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )#locale
self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )#birthyear,空值0填充
self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )#处理性别
self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )#处理joinedAt列
self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )#处理location
self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )#处理timezone
fin.close()
        #L1-normalize each feature column
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)
        #compute the user similarity matrix; it is used later
        self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )#(3391, 3391)
for i in range(0, nusers):
self.userSimMatrix[i, i] = 1.0
for u1, u2 in programEntities.uniqueUserPairs:
i = programEntities.userIndex[u1]
j = programEntities.userIndex[u2]
if (i, j) not in self.userSimMatrix:
                #print(self.userMatrix.getrow(i).todense()) e.g. [[0.00028123,0.00029847,0.00043592,0.00035208,0,0.00032346]]
                #print(self.userMatrix.getrow(j).todense()) e.g. [[0.00028123,0.00029742,0.00043592,0.00035208,0,-0.00032346]]
usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())
self.userSimMatrix[i, j] = usim
self.userSimMatrix[j, i] = usim
sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
#mining user social relations
class UserFriends:
    """
    Find each user's friends; the idea is very simple:
    1) if you have many friends, you may be outgoing and more likely to attend events
    2) if your friends attend an event, you may go along with them
    """
    def __init__(self, programEntities):
        nusers = len(programEntities.userIndex.keys())#3391
        self.numFriends = np.zeros( (nusers) )#array([0., 0., ..., 0.]), number of friends per user
        self.userFriends = ss.dok_matrix( (nusers, nusers) )
        fin = gzip.open('user_friends.csv.gz')
        print( 'Header In User_friends.csv.gz:',fin.readline() )
        ln = 0
        #read user_friends.csv.gz line by line
        #only users present in userIndex are of interest; for each such user,
        #take its index and friend count
        #for every friend also in userIndex, look up the friend's row in userEventScores,
        #i.e. that friend's reaction to each event
        #score is the friend's average score over all events
        #the userFriends matrix records these user-friend scores
        #e.g. user 851286067 (row 1750) appears in test.csv and has 2151 friends in user_friends.csv.gz,
        #so its share is 2151 / sumNumFriends = 2151 / 3731377.0 = 0.0005764627910822198
for line in fin:
if ln % 200 == 0:
print( 'Loading line:', ln )
cols = line.decode().strip().split(',')
user = cols[0]
if user in programEntities.userIndex:
                friends = cols[1].split(' ')#the user's friend list
                i = programEntities.userIndex[user]
                self.numFriends[i] = len(friends)
                for friend in friends:
                    if friend in programEntities.userIndex:
                        j = programEntities.userIndex[friend]
                        #the objective of this score is to infer the degree to which,
                        #and the direction in which, this friend will influence the
                        #user's decision, so we sum the user/event score for
                        #this user across all training events
                        eventsForUser = programEntities.userEventScores.getrow(j).todense()#the friend's reaction to each event: 0, 1, or -1
                        #print(eventsForUser.sum(), np.shape(eventsForUser)[1] )
                        #score is the friend's average over all 13418 events
                        score = eventsForUser.sum() / np.shape(eventsForUser)[1]#np.shape(eventsForUser)[1] = 13418
                        #print(score)
                        self.userFriends[i, j] += score
                        self.userFriends[j, i] += score
ln += 1
fin.close()
        #normalize the array
        sumNumFriends = self.numFriends.sum(axis=0)#total number of friends over all users
        #print(sumNumFriends)
        self.numFriends = self.numFriends / sumNumFriends#each user's share of the total friend count
sio.mmwrite('UF_numFriends', np.matrix(self.numFriends) )
self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
sio.mmwrite('UF_userFriends', self.userFriends)
#build event-event similarity data
class Events:
    """
    Build event-event similarities; note there are 2 kinds:
    1) similarity derived from user-event behavior, as in collaborative filtering
    2) similarity computed from the event's own content (event metadata)
    """
    def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
        cleaner = DataCleaner()
        fin = gzip.open('events.csv.gz')
        fin.readline()#skip header
        nevents = len(programEntities.eventIndex) #number of events
        print(nevents)#13418
        self.eventPropMatrix = ss.dok_matrix( (nevents, 7) ) #the first 7 property features per event
        self.eventContMatrix = ss.dok_matrix( (nevents, 100) ) #the 100 content features per event
ln = 0
for line in fin:
#if ln > 10:
#break
cols = line.decode().strip().split(',')
eventId = cols[0]
if eventId in programEntities.eventIndex:
i = programEntities.eventIndex[eventId]
self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth( cols[2] )#start_time
self.eventPropMatrix[i, 1] = cleaner.getFeatureHash( cols[3] )#city
self.eventPropMatrix[i, 2] = cleaner.getFeatureHash( cols[4] )#state
self.eventPropMatrix[i, 3] = cleaner.getFeatureHash( cols[5] )#zip
self.eventPropMatrix[i, 4] = cleaner.getFeatureHash( cols[6] )#country
self.eventPropMatrix[i, 5] = cleaner.getFloatValue( cols[7] )#lat
self.eventPropMatrix[i, 6] = cleaner.getFloatValue( cols[8] )#lon
                #copy the 100 content count columns (columns 10-109) into the content matrix
                for j in range(9, 109):
                    self.eventContMatrix[i, j-9] = cols[j]
ln += 1
fin.close()
        #L1-normalize feature matrix 1
        self.eventPropMatrix = normalize(self.eventPropMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventPropMatrix', self.eventPropMatrix)
        #L1-normalize feature matrix 2
        self.eventContMatrix = normalize(self.eventContMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventContMatrix', self.eventContMatrix)
#calculate similarity between event pairs based on the two matrices
self.eventPropSim = ss.dok_matrix( (nevents, nevents) )
self.eventContSim = ss.dok_matrix( (nevents, nevents) )
for e1, e2 in programEntities.uniqueEventPairs:
i = programEntities.eventIndex[e1]
j = programEntities.eventIndex[e2]
            #similarity over the 7 property features
if not ((i, j) in self.eventPropSim):
epsim = psim( self.eventPropMatrix.getrow(i).todense(), self.eventPropMatrix.getrow(j).todense())
self.eventPropSim[i, j] = epsim
self.eventPropSim[j, i] = epsim
            #similarity over the 100 content features
if not ((i, j) in self.eventContSim):
ecsim = csim( self.eventContMatrix.getrow(i).todense(), self.eventContMatrix.getrow(j).todense())
self.eventContSim[i, j] = ecsim
self.eventContSim[j, i] = ecsim
sio.mmwrite('EV_eventPropSim', self.eventPropSim)
sio.mmwrite('EV_eventContSim', self.eventContSim)
print('Step 1: gather user and event statistics...')
pe = ProgramEntities()
print('Step 1 done...\n')
print('Step 2: compute user similarities and store them as a matrix...')
#Users(pe)  #commented out here; presumably the US_* matrices were already generated in an earlier run
print('Step 2 done...\n')
print('Step 3: compute user social relations and store them...')
UserFriends(pe)
print('Step 3 done...\n')
print('Step 4: compute event similarities and store them as matrices...')
Events(pe)
print('Step 4 done...\n')
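Before moving on, it can help to sanity-check the artifacts the script wrote to disk. A minimal sketch (assuming the PE_* files above exist in the working directory; the expected counts come from the statistics quoted earlier):
import scipy.io as sio
import _pickle as cPickle
userIndex = cPickle.load( open('PE_userIndex.pkl', 'rb') )
userEventScores = sio.mmread('PE_userEventScores')
print( len(userIndex), userEventScores.shape )#expect 3391 users and a 3391 x 13418 score matrix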
Step 5: activity / event popularity data
Since this step uses the event_attendees.csv.gz file, let's take a look at it first
import pandas as pd
df_events_attendees = pd.read_csv('event_attendees.csv.gz', compression='gzip')
df_events_attendees.head()
event | yes | maybe | invited | no | |
---|---|---|---|---|---|
0 | 1159822043 | 1975964455 252302513 4226086795 3805886383 142... | 2733420590 517546982 1350834692 532087573 5831... | 1723091036 3795873583 4109144917 3560622906 31... | 3575574655 1077296663 |
1 | 686467261 | 2394228942 2686116898 1056558062 3792942231 41... | 1498184352 645689144 3770076778 331335845 4239... | 1788073374 733302094 1830571649 676508092 7081... | NaN |
2 | 1186208412 | NaN | 3320380166 3810793697 | 1379121209 440668682 | 1728988561 2950720854 |
3 | 2621578336 | NaN | NaN | NaN | NaN |
4 | 855842686 | 2406118796 3550897984 294255260 1125817077 109... | 2671721559 1761448345 2356975806 2666669465 10... | 1518670705 880919237 2326414227 2673818347 332... | 3500235232 |
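The EventAttendees class below will score event popularity from the yes and no columns of this file. As a quick preview, a hedged pandas sketch of the same count (note that str.split() treats NaN/empty fields as zero attendees, unlike the ' '-split used later in the class):
yes_cnt = df_events_attendees['yes'].fillna('').str.split().str.len()
no_cnt = df_events_attendees['no'].fillna('').str.split().str.len()
print( (yes_cnt - no_cnt).head() )#per-event yes-minus-no counts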
## Full code for step 5
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
import scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalize
import gzip
import numpy as np
import hashlib
#process user-event association data
class ProgramEntities:
    """
    We only care about users and events that appear in train and test,
    so we focus on this associated data; by count, train and test contain
    3391 users and 13418 events in total
    """
    def __init__(self):
        #count the distinct users and events in the data
        uniqueUsers = set()#uniqueUsers holds all users: 3391 of them
        uniqueEvents = set()#uniqueEvents holds all events: 13418 of them
        eventsForUser = defaultdict(set)#maps each user to the events it acted on
        usersForEvent = defaultdict(set)#maps each event to the users who acted on it
for filename in ['train.csv', 'test.csv']:
f = open(filename)
            f.readline()#skip the header line
for line in f:
cols = line.strip().split(',')
uniqueUsers.add( cols[0] )
uniqueEvents.add( cols[1] )
eventsForUser[cols[0]].add( cols[1] )
usersForEvent[cols[1]].add( cols[0] )
f.close()
self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
self.userIndex = dict()
self.eventIndex = dict()
for i, u in enumerate(uniqueUsers):
self.userIndex[u] = i
for i, e in enumerate(uniqueEvents):
self.eventIndex[e] = i
ftrain = open('train.csv')
ftrain.readline()
for line in ftrain:
cols = line.strip().split(',')
i = self.userIndex[ cols[0] ]
j = self.eventIndex[ cols[1] ]
self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
ftrain.close()
sio.mmwrite('PE_userEventScores', self.userEventScores)
        #To avoid unnecessary computation, we collect all associated users and associated events
        #an associated user pair means two users who acted on at least one common event
        #an associated event pair means two events that at least one common user acted on
self.uniqueUserPairs = set()
self.uniqueEventPairs = set()
for event in uniqueEvents:
users = usersForEvent[event]
if len(users) > 2:
self.uniqueUserPairs.update( itertools.combinations(users, 2) )
for user in uniqueUsers:
events = eventsForUser[user]
if len(events) > 2:
self.uniqueEventPairs.update( itertools.combinations(events, 2) )
        #print(self.userIndex)
cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
#Data cleaning class
class DataCleaner:
    def __init__(self):
        #helper methods that convert strings to numeric values
        #load locales
        self.localeIdMap = defaultdict(int)
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1
        #load countries
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            #note: depending on the pycountry version, the US entry may be named
            #'United States', in which case the 'usa' check below never matches
            if c.name.lower() == 'usa':
                ctryIdx['US'] = i
            if c.name.lower() == 'canada':
                ctryIdx['CA'] = i
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
        self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
    #locale handling
    def getLocaleId(self, locstr):
        #localeIdMap is a defaultdict(int), so if locstr.lower() is not a key it returns the default int 0
        return self.localeIdMap[ locstr.lower() ]
    #birthyear handling: empty or invalid values become 0
    def getBirthYearInt(self, birthYear):
        try:
            return 0 if birthYear == 'None' else int(birthYear)
        except ValueError:
            return 0
    #gender handling
    def getGenderId(self, genderStr):
        return self.genderIdMap[genderStr]
    #joinedAt handling: encode the timestamp as year followed by month
    def getJoinedYearMonth(self, dateString):
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        #note: the month is not zero-padded, so e.g. 2012-01 becomes '20121'
        return "".join( [str(dttm.year), str(dttm.month) ] )
    #location handling: map the token after the last space to a country id
    def getCountryId(self, location):
        if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind(' ') > -1:
            return self.countryIdMap[ location[location.rindex(' ') + 2: ].lower() ]
        else:
            return 0
    #timezone handling
    def getTimezoneInt(self, timezone):
        try:
            return int(timezone)
        except ValueError:
            return 0
    def getFeatureHash(self, value):
        if len(value.strip()) == 0:
            return -1
        else:
            #int( hashlib.sha224(value).hexdigest()[0:4], 16) fails on Python 3 with
            #TypeError: Unicode-objects must be encoded before hashing
            return int( hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4], 16)#so encode first
def getFloatValue(self, value):
if len(value.strip()) == 0:
return 0.0
else:
return float(value)
#user-user similarity matrix
class Users:
    """
    Build the user/user similarity matrix
    """
    def __init__(self, programEntities, sim=ssd.correlation):#ssd.correlation(u, v) computes the correlation distance between vectors u and v
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())#3391
        #print(nusers)
        fin = open('users.csv')
        colnames = fin.readline().strip().split(',') #7 feature columns
        self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )#sparse matrix of user features
        for line in fin:
            cols = line.strip().split(',')
            #the original author's comment said "only consider users that appear in train.csv",
            #but userIndex actually contains every user from both train and test,
            #so this filter keeps any user seen in either file
            if cols[0] in programEntities.userIndex:
                i = programEntities.userIndex[ cols[0] ]#index of this user
                self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )#locale
                self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )#birthyear, 0 for missing values
                self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )#gender
                self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )#joinedAt
                self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )#location
                self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )#timezone
fin.close()
        #L1-normalize each feature column
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)
        #compute the user similarity matrix; it is used later
        self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )#(3391, 3391)
for i in range(0, nusers):
self.userSimMatrix[i, i] = 1.0
for u1, u2 in programEntities.uniqueUserPairs:
i = programEntities.userIndex[u1]
j = programEntities.userIndex[u2]
if (i, j) not in self.userSimMatrix:
                #print(self.userMatrix.getrow(i).todense()) e.g. [[0.00028123,0.00029847,0.00043592,0.00035208,0,0.00032346]]
                #print(self.userMatrix.getrow(j).todense()) e.g. [[0.00028123,0.00029742,0.00043592,0.00035208,0,-0.00032346]]
usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())
self.userSimMatrix[i, j] = usim
self.userSimMatrix[j, i] = usim
sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
#mining user social relations
class UserFriends:
    """
    Find each user's friends; the idea is very simple:
    1) if you have many friends, you may be outgoing and more likely to attend events
    2) if your friends attend an event, you may go along with them
    """
    def __init__(self, programEntities):
        nusers = len(programEntities.userIndex.keys())#3391
        self.numFriends = np.zeros( (nusers) )#array([0., 0., ..., 0.]), number of friends per user
        self.userFriends = ss.dok_matrix( (nusers, nusers) )
        fin = gzip.open('user_friends.csv.gz')
        print( 'Header In User_friends.csv.gz:',fin.readline() )
        ln = 0
        #read user_friends.csv.gz line by line
        #only users present in userIndex are of interest; for each such user,
        #take its index and friend count
        #for every friend also in userIndex, look up the friend's row in userEventScores,
        #i.e. that friend's reaction to each event
        #score is the friend's average score over all events
        #the userFriends matrix records these user-friend scores
        #e.g. user 851286067 (row 1750) appears in test.csv and has 2151 friends in user_friends.csv.gz,
        #so its share is 2151 / sumNumFriends = 2151 / 3731377.0 = 0.0005764627910822198
for line in fin:
if ln % 200 == 0:
print( 'Loading line:', ln )
cols = line.decode().strip().split(',')
user = cols[0]
if user in programEntities.userIndex:
                friends = cols[1].split(' ')#the user's friend list
                i = programEntities.userIndex[user]
                self.numFriends[i] = len(friends)
                for friend in friends:
                    if friend in programEntities.userIndex:
                        j = programEntities.userIndex[friend]
                        #the objective of this score is to infer the degree to which,
                        #and the direction in which, this friend will influence the
                        #user's decision, so we sum the user/event score for
                        #this user across all training events
                        eventsForUser = programEntities.userEventScores.getrow(j).todense()#the friend's reaction to each event: 0, 1, or -1
                        #print(eventsForUser.sum(), np.shape(eventsForUser)[1] )
                        #score is the friend's average over all 13418 events
                        score = eventsForUser.sum() / np.shape(eventsForUser)[1]#np.shape(eventsForUser)[1] = 13418
                        #print(score)
                        self.userFriends[i, j] += score
                        self.userFriends[j, i] += score
ln += 1
fin.close()
        #normalize the array
        sumNumFriends = self.numFriends.sum(axis=0)#total number of friends over all users
        #print(sumNumFriends)
        self.numFriends = self.numFriends / sumNumFriends#each user's share of the total friend count
sio.mmwrite('UF_numFriends', np.matrix(self.numFriends) )
self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
sio.mmwrite('UF_userFriends', self.userFriends)
#build event-event similarity data
class Events:
    """
    Build event-event similarities; note there are 2 kinds:
    1) similarity derived from user-event behavior, as in collaborative filtering
    2) similarity computed from the event's own content (event metadata)
    """
    def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
        cleaner = DataCleaner()
        fin = gzip.open('events.csv.gz')
        fin.readline()#skip header
        nevents = len(programEntities.eventIndex)
        print(nevents)#13418
        self.eventPropMatrix = ss.dok_matrix( (nevents, 7) )
        self.eventContMatrix = ss.dok_matrix( (nevents, 100) )
ln = 0
for line in fin:
#if ln > 10:
#break
cols = line.decode().strip().split(',')
eventId = cols[0]
if eventId in programEntities.eventIndex:
i = programEntities.eventIndex[eventId]
self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth( cols[2] )#start_time
self.eventPropMatrix[i, 1] = cleaner.getFeatureHash( cols[3] )#city
self.eventPropMatrix[i, 2] = cleaner.getFeatureHash( cols[4] )#state
self.eventPropMatrix[i, 3] = cleaner.getFeatureHash( cols[5] )#zip
self.eventPropMatrix[i, 4] = cleaner.getFeatureHash( cols[6] )#country
self.eventPropMatrix[i, 5] = cleaner.getFloatValue( cols[7] )#lat
self.eventPropMatrix[i, 6] = cleaner.getFloatValue( cols[8] )#lon
for j in range(9, 109):
self.eventContMatrix[i, j-9] = cols[j]
ln += 1
fin.close()
self.eventPropMatrix = normalize(self.eventPropMatrix, norm='l1', axis=0, copy=False)
sio.mmwrite('EV_eventPropMatrix', self.eventPropMatrix)
self.eventContMatrix = normalize(self.eventContMatrix, norm='l1', axis=0, copy=False)
sio.mmwrite('EV_eventContMatrix', self.eventContMatrix)
#calculate similarity between event pairs based on the two matrices
self.eventPropSim = ss.dok_matrix( (nevents, nevents) )
self.eventContSim = ss.dok_matrix( (nevents, nevents) )
for e1, e2 in programEntities.uniqueEventPairs:
i = programEntities.eventIndex[e1]
j = programEntities.eventIndex[e2]
if not ((i, j) in self.eventPropSim):
epsim = psim( self.eventPropMatrix.getrow(i).todense(), self.eventPropMatrix.getrow(j).todense())
if np.isnan(epsim):
epsim = 0
self.eventPropSim[i, j] = epsim
self.eventPropSim[j, i] = epsim
if not ((i, j) in self.eventContSim):
            #if one of the two vectors is all zeros, cosine distance returns nan
            """
            import numpy as np
            a = np.array([0, 1, 1, 1, 0, 0, 0, 1, 0, 0])
            b = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
            from scipy.spatial.distance import cosine
            temp = cosine(a, b)
            triggers the following warning:
            Warning (from warnings module):
              File "D:\Python35\lib\site-packages\scipy\spatial\distance.py", line 644
                dist = 1.0 - uv / np.sqrt(uu * vv)
            RuntimeWarning: invalid value encountered in double_scalars
            """
ecsim = csim( self.eventContMatrix.getrow(i).todense(), self.eventContMatrix.getrow(j).todense())
if np.isnan(ecsim):
ecsim = 0
self.eventContSim[i, j] = ecsim
self.eventContSim[j, i] = ecsim
sio.mmwrite('EV_eventPropSim', self.eventPropSim)
sio.mmwrite('EV_eventContSim', self.eventContSim)
#step 5
class EventAttendees:
    """
    Count how many people attend or skip each event,
    as the basis for an event popularity score
    """
    def __init__(self, programEntities):
        nevents = len(programEntities.eventIndex)#13418 events in total
        self.eventPopularity = ss.dok_matrix( (nevents, 1) )
f = gzip.open('event_attendees.csv.gz')
f.readline()#skip header
for line in f:
cols = line.decode().strip().split(',')
eventId = cols[0]
if eventId in programEntities.eventIndex:
i = programEntities.eventIndex[eventId]
                self.eventPopularity[i, 0] = len(cols[1].split(' ')) - len(cols[4].split(' '))#yes count minus no count (note ''.split(' ') == [''], so an empty field still counts as 1)
f.close()
self.eventPopularity = normalize( self.eventPopularity, norm='l1', axis=0, copy=False)
sio.mmwrite('EA_eventPopularity', self.eventPopularity)
def data_prepare():
    """
    Compute all the data and store it as matrices or in other forms,
    so features can be extracted and models built on top of it later
    """
    print('Step 1: gather user and event statistics...')
    pe = ProgramEntities()
    print('Step 1 done...\n')
    print('Step 2: compute user similarities and store them as a matrix...')
    Users(pe)
    print('Step 2 done...\n')
    print('Step 3: compute user social relations and store them...')
    UserFriends(pe)
    print('Step 3 done...\n')
    print('Step 4: compute event similarities and store them as matrices...')
    Events(pe)
    print('Step 4 done...\n')
    print('Step 5: compute event popularity...')
    EventAttendees(pe)
    print('Step 5 done...\n')
#run the data preparation
data_prepare()
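Note that eventPopularity, like the other matrices in this pipeline, is L1-normalized column-wise: each score is divided by the column's sum of absolute values. A tiny illustration with toy numbers (not real data):
import numpy as np
from sklearn.preprocessing import normalize
col = np.array([[3.0], [-1.0], [6.0]])#toy yes-minus-no counts
print( normalize(col, norm='l1', axis=0) )#each entry divided by |3| + |-1| + |6| = 10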
6. Feature construction
#feature construction section
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
import scipy.io as sio
import numpy as np
class DataRewriter:
def __init__(self):
        #load the precomputed matrices for initialization
self.userIndex = cPickle.load( open('PE_userIndex.pkl','rb') )
self.eventIndex = cPickle.load( open('PE_eventIndex.pkl', 'rb') )
self.userEventScores = sio.mmread('PE_userEventScores').todense()
self.userSimMatrix = sio.mmread('US_userSimMatrix').todense()
self.eventPropSim = sio.mmread('EV_eventPropSim').todense()
self.eventContSim = sio.mmread('EV_eventContSim').todense()
self.numFriends = sio.mmread('UF_numFriends')
self.userFriends = sio.mmread('UF_userFriends').todense()
self.eventPopularity = sio.mmread('EA_eventPopularity').todense()
    def userReco(self, userId, eventId):
        """
        Event recommendation score from user-based collaborative filtering
        Basic pseudocode:
          for every item i:
            for every other user v that has a preference for i:
              compute similarity s between u and v
              incorporate v's preference for i weighted by s into a running average
          return top items ranked by weighted average
        """
i = self.userIndex[userId]
j = self.eventIndex[eventId]
vs = self.userEventScores[:, j]
sims = self.userSimMatrix[i, :]
prod = sims * vs
try:
return prod[0, 0] - self.userEventScores[i, j]
except IndexError:
return 0
    def eventReco(self, userId, eventId):
        """
        Event recommendation score from item-based collaborative filtering
        Basic pseudocode:
          for every item i:
            for every item j that u has a preference for:
              compute similarity s between i and j
              add u's preference for j weighted by s to a running average
          return top items ranked by weighted average
        """
i = self.userIndex[userId]
j = self.eventIndex[eventId]
js = self.userEventScores[i, :]
psim = self.eventPropSim[:, j]
csim = self.eventContSim[:, j]
pprod = js * psim
cprod = js * csim
pscore = 0
cscore = 0
try:
pscore = pprod[0, 0] - self.userEventScores[i, j]
except IndexError:
pass
try:
cscore = cprod[0, 0] - self.userEventScores[i, j]
except IndexError:
pass
return pscore, cscore
    def userPop(self, userId):
        """
        Infer how social a user is from the number of friends;
        the assumption is that users with many friends are more
        inclined to take part in social events
        """
if userId in self.userIndex:
i = self.userIndex[userId]
try:
return self.numFriends[0, i]
except IndexError:
return 0
else:
return 0
    def friendInfluence(self, userId):
        """
        Friends' influence on the user:
        of all the user's friends, how many actively attend events;
        a circle of friends who actively attend events may influence the user
        """
        nusers = np.shape(self.userFriends)[1]
        i = self.userIndex[userId]
        #average the friend scores over the user's whole row;
        #the original code summed with axis=0, which on the 1 x nusers row
        #is a no-op, so [0, 0] picked out only the first friend's score
        return (self.userFriends[i, :].sum(axis=1) / nusers)[0, 0]
    def eventPop(self, eventId):
        """
        The event's own popularity, measured by the number of attendees
        """
i = self.eventIndex[eventId]
return self.eventPopularity[i, 0]
    def rewriteData(self, start=1, train=True, header=True):
        """
        Combine the user-based CF score, the item-based CF scores, and the
        popularity/influence measures above into one feature set,
        and write a new train/test file for the classifier
        """
fn = 'train.csv' if train else 'test.csv'
fin = open(fn)
fout = open('data_' + fn, 'w')
#write output header
if header:
ocolnames = ['invited', 'user_reco', 'evt_p_reco', 'evt_c_reco', 'user_pop', 'frnd_infl', 'evt_pop']
if train:
ocolnames.append('interested')
ocolnames.append('not_interested')
fout.write( ','.join(ocolnames) + '\n' )
ln = 0
for line in fin:
ln += 1
if ln < start:
continue
cols = line.strip().split(',')
#user,event,invited,timestamp,interested,not_interested
userId = cols[0]
eventId = cols[1]
invited = cols[2]
if ln % 500 == 0:
print("%s : %d (userId, eventId) = (%s, %s)" % (fn, ln, userId, eventId))
user_reco = self.userReco( userId, eventId )
evt_p_reco, evt_c_reco = self.eventReco( userId, eventId )
user_pop = self.userPop( userId )
frnd_infl = self.friendInfluence( userId )
evt_pop = self.eventPop( eventId )
ocols = [invited, user_reco, evt_p_reco, evt_c_reco, user_pop, frnd_infl, evt_pop]
if train:
ocols.append( cols[4] )#interested
ocols.append( cols[5] )#not_interested
            fout.write(','.join( map(str, ocols) ) + '\n')
fin.close()
fout.close()
    def rewriteTrainingSet(self):
        self.rewriteData(train=True)#pass train as a keyword; a bare positional True/False would bind to start
    def rewriteTestSet(self):
        self.rewriteData(train=False)
dr = DataRewriter()
print('Generating training data...\n')
dr.rewriteData(train=True, start=2, header=True)
print('Generating test data...\n')
dr.rewriteData(train=False, start=2, header=True)
print('done')
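A quick look at the generated feature file can confirm the columns match ocolnames (a sketch, assuming data_train.csv was written by the run above):
import pandas as pd
df_feat = pd.read_csv('data_train.csv')
print( df_feat.columns.tolist() )#['invited', 'user_reco', 'evt_p_reco', 'evt_c_reco', 'user_pop', 'frnd_infl', 'evt_pop', 'interested', 'not_interested']
print( df_feat.shape )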
Step 7: model building and prediction
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')
def train():
    """
    Train a classifier on the features we generated;
    the target is 1 (interested) or 0 (not interested)
    """
    trainDf = pd.read_csv('data_train.csv')
    X = np.matrix( pd.DataFrame(trainDf, index=None, columns=['invited', 'user_reco', 'evt_p_reco',
                 'evt_c_reco','user_pop', 'frnd_infl', 'evt_pop']) )
    y = np.array(trainDf.interested)
    clf = SGDClassifier(loss='log', penalty='l2')#logistic regression trained with SGD
    clf.fit(X, y)
    return clf
def validate():
    """
    10-fold cross-validation; prints per-fold accuracy and the average
    """
    trainDf = pd.read_csv('data_train.csv')
    X = np.matrix(pd.DataFrame(trainDf, index=None, columns=['invited', 'user_reco', 'evt_p_reco',
                 'evt_c_reco','user_pop', 'frnd_infl', 'evt_pop']) )
    y = np.array(trainDf.interested)
    kfold = KFold(n_splits=10, shuffle=False)
    avgAccuracy = 0
    run = 0
    for train_idx, test_idx in kfold.split(X, y):#renamed from train/test to avoid shadowing the functions above
        Xtrain, Xtest, ytrain, ytest = X[train_idx], X[test_idx], y[train_idx], y[test_idx]
        clf = SGDClassifier(loss='log', penalty='l2')
        clf.fit(Xtrain, ytrain)
        accuracy = 0
        ntest = len(ytest)
        for i in range(0, ntest):
            yt = clf.predict(Xtest[i, :])
            if yt == ytest[i]:
                accuracy += 1
        accuracy = accuracy / ntest
        avgAccuracy += accuracy
        run += 1
        print('accuracy(run %d) : %f' % (run, accuracy) )
    print('average accuracy : %f' % (avgAccuracy / run) )
def test(clf):
    """
    Read the test data and predict with the trained classifier
    """
origTestDf = pd.read_csv("test.csv")
users = origTestDf.user
events = origTestDf.event
testDf = pd.read_csv("data_test.csv")
fout = open("result.csv", 'w')
fout.write(",".join(["user", "event", "outcome", "dist"]) + "\n")
nrows = len(testDf)
Xp = np.matrix(testDf)
yp = np.zeros((nrows, 2))
for i in range(0, nrows):
xp = Xp[i, :]
yp[i, 0] = clf.predict(xp)
yp[i, 1] = clf.decision_function(xp)
fout.write(",".join( map( lambda x: str(x), [users[i], events[i], yp[i, 0], yp[i, 1]] ) ) + "\n")
fout.close()
clf = train()
validate()
test(clf)
print('done')
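Since the classifier is trained with loss='log' (logistic regression), it also exposes predict_proba, so a probability can be reported instead of the raw decision_function margin. A hedged sketch of such a variant (predict_with_proba is a hypothetical helper, not part of the original pipeline):
def predict_with_proba(clf, Xp):
    """Return predicted labels and P(interested) for each test row."""
    labels = clf.predict(Xp)
    probs = clf.predict_proba(Xp)[:, 1]#probability of class 1 (interested)
    return labels, probs
#usage sketch: labels, probs = predict_with_proba(clf, np.matrix(pd.read_csv('data_test.csv')))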