Event Recommendation Engine Challenge (Basic Version) --- Code

Step 1: Collect user and event statistics

#Inspect the data in train.csv
import pandas as pd
df_train = pd.read_csv('train.csv')
df_train.head()
      user       event  invited                         timestamp  interested  not_interested
0  3044012  1918771225        0  2012-10-02 15:53:05.754000+00:00           0               0
1  3044012  1502284248        0  2012-10-02 15:53:05.754000+00:00           0               0
2  3044012  2529072432        0  2012-10-02 15:53:05.754000+00:00           1               0
3  3044012  3072478280        0  2012-10-02 15:53:05.754000+00:00           0               0
4  3044012  1390707377        0  2012-10-02 15:53:05.754000+00:00           0               0
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15398 entries, 0 to 15397
Data columns (total 6 columns):
user              15398 non-null int64
event             15398 non-null int64
invited           15398 non-null int64
timestamp         15398 non-null object
interested        15398 non-null int64
not_interested    15398 non-null int64
dtypes: int64(5), object(1)
memory usage: 721.9+ KB
#Inspect the data in test.csv
df_test = pd.read_csv('test.csv')
df_test.head()
      user       event  invited                         timestamp
0  1776192  2877501688        0  2012-11-30 11:39:01.230000+00:00
1  1776192  3025444328        0  2012-11-30 11:39:01.230000+00:00
2  1776192  4078218285        0  2012-11-30 11:39:01.230000+00:00
3  1776192  1024025121        0  2012-11-30 11:39:01.230000+00:00
4  1776192  2972428928        0  2012-11-30 11:39:21.985000+00:00
df_test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10237 entries, 0 to 10236
Data columns (total 4 columns):
user         10237 non-null int64
event        10237 non-null int64
invited      10237 non-null int64
timestamp    10237 non-null object
dtypes: int64(3), object(1)
memory usage: 320.0+ KB
  • The first two columns are the user ID and the corresponding event ID
  • test.csv lacks the label columns (interested / not_interested)
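As a quick sanity check (my own addition, not part of the original walkthrough), we can look at how the labels are distributed in train.csv:

#How many rows are marked interested / not_interested?
print(df_train['interested'].value_counts())
print(df_train['not_interested'].value_counts())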
#The full program for Step 1
from collections import defaultdict
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle

#Analyzes the associations between users and events in train and test.
class ProgramEntities:
    """
    We only care about the users and events that appear in train and test, so we focus on that associated data.
    By count, train and test contain 3391 users and 13418 events in total.
    """
    def __init__(self):
        #Count the distinct users and events in the data
        uniqueUsers = set()#uniqueUsers holds every user: 3391 in total
        uniqueEvents = set()#uniqueEvents holds every event: 13418 in total
        eventsForUser = defaultdict(set)#maps each user to the set of events that user acted on
        usersForEvent = defaultdict(set)#maps each event to the set of users who acted on it
        for filename in ['train.csv', 'test.csv']:
            f = open(filename)
            f.readline()#skip the header line
            for line in f:
                cols = line.strip().split(',')
                uniqueUsers.add( cols[0] )  #collect all users
                uniqueEvents.add( cols[1] ) #collect all events
                eventsForUser[cols[0]].add( cols[1] )  #keyed by user: record the events for each user
                usersForEvent[cols[1]].add( cols[0] )  #keyed by event: record the users for each event
            f.close()
         
        self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
        self.userIndex = dict()
        self.eventIndex = dict()
        for i, u in enumerate(uniqueUsers):
            self.userIndex[u] = i
        for i, e in enumerate(uniqueEvents):
            self.eventIndex[e] = i
             
        ftrain = open('train.csv')
        ftrain.readline()
        for line in ftrain:
            cols = line.strip().split(',')
            i = self.userIndex[ cols[0] ]
            j = self.eventIndex[ cols[1] ]
            self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
        ftrain.close()
        sio.mmwrite('PE_userEventScores', self.userEventScores)
         
        #To avoid unnecessary computation we find all associated users and associated events
        #An associated user pair is two users who both acted on at least one common event
        #An associated event pair is two events that at least one common user acted on
        self.uniqueUserPairs = set()
        self.uniqueEventPairs = set()
        #find associated users
        for event in uniqueEvents:
            users = usersForEvent[event]
            if len(users) > 2:
                self.uniqueUserPairs.update( itertools.combinations(users, 2) )
        #find associated events
        for user in uniqueUsers:
            events = eventsForUser[user]
            if len(events) > 2:
                self.uniqueEventPairs.update( itertools.combinations(events, 2) )
        #print(self.userIndex)
        cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
        cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
 
print('Step 1: collecting user and event statistics...')
pe = ProgramEntities()
print('Step 1 done...\n')
Step 1: collecting user and event statistics...
Step 1 done...
pe.userEventScores
<3391x13418 sparse matrix of type '<class 'numpy.float64'>'
	with 4645 stored elements in Dictionary Of Keys format>

Notes:

  • PE_userEventScores.mtx is the matrix over all users and events, but it only contains the values from train.csv, each of which is 1 or -1
  • scipy.sparse.dok_matrix() creates a sparse matrix, so PE_userEventScores.mtx stores only the nonzero values
  • A quick rundown of the variables used in this step:
    • uniqueUsers: set of all user IDs in train.csv and test.csv
    • uniqueEvents: set of all event IDs in train.csv and test.csv
    • eventsForUser: dict mapping each user to that user's set of events
    • usersForEvent: dict mapping each event to that event's set of users
    • userIndex: dict assigning each user an index
    • eventIndex: dict assigning each event an index
    • userEventScores: 3391 x 13418 sparse matrix, user vs event; each element is the interest score (1, 0, or -1)
      of a user for an event in train.csv, i.e. interested - not_interested
import pandas as pd
pd.DataFrame(pe.userEventScores.todense())

[image: userEventScores, each user's interest score (1, 0, or -1) for each event]
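As a minimal illustration (a toy example, not part of the pipeline) of why the DOK format keeps the saved matrix small, only explicitly assigned cells are stored:

import scipy.sparse as ss
m = ss.dok_matrix( (3, 4) )#3x4 matrix of implicit zeros
m[0, 1] = 1
m[2, 3] = -1
print(m.nnz)#2 stored elements
print(m.todense())#the dense view shows the implicit zeros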

import pandas as pd
df_train = pd.read_csv('train.csv')
df_train[df_train['event']==1502284248]
import itertools
for each in itertools.combinations(set([3044012,1302145719,3194014105,3669515588]), 2):
    print(each)
(3194014105, 3669515588)
(3194014105, 3044012)
(3194014105, 1302145719)
(3669515588, 3044012)
(3669515588, 1302145719)
(3044012, 1302145719)


uniqueUserPairs: a set. For any event associated with 3 or more users, those users are paired two-by-two and the pairs are stored in uniqueUserPairs. Note that it stores user IDs, not user indices:

import pandas as pd
df_train = pd.read_csv('train.csv')
df_train[df_train['user']==3044012]
 
      user       event  invited                         timestamp  interested  not_interested
0  3044012  1918771225        0  2012-10-02 15:53:05.754000+00:00           0               0
1  3044012  1502284248        0  2012-10-02 15:53:05.754000+00:00           0               0
2  3044012  2529072432        0  2012-10-02 15:53:05.754000+00:00           1               0
3  3044012  3072478280        0  2012-10-02 15:53:05.754000+00:00           0               0
4  3044012  1390707377        0  2012-10-02 15:53:05.754000+00:00           0               0
5  3044012  1532377761        0  2012-10-02 15:53:05.754000+00:00           0               0
import itertools
for each in itertools.combinations(set([1918771225,1502284248,2529072432, 3072478280, 1390707377, 1532377761    ]), 2):
    print(each)
(1532377761, 3072478280)
(1532377761, 2529072432)
(1532377761, 1390707377)
(1532377761, 1502284248)
(1532377761, 1918771225)
(3072478280, 2529072432)
(3072478280, 1390707377)
(3072478280, 1502284248)
(3072478280, 1918771225)
(2529072432, 1390707377)
(2529072432, 1502284248)
(2529072432, 1918771225)
(1390707377, 1502284248)
(1390707377, 1918771225)
(1502284248, 1918771225)


Step 2: Compute user similarity

This step uses users.csv, so let's look at its contents first (the first 10 rows):

import pandas as pd
df_users = pd.read_csv('users.csv')
df_users.head(10)
      user_id locale  birthyear  gender                  joinedAt            location  timezone
0  3197468391  id_ID       1993    male  2012-10-02T06:40:55.524Z    Medan  Indonesia     480.0
1  3537982273  id_ID       1992    male  2012-09-29T18:03:12.111Z    Medan  Indonesia     420.0
2   823183725  en_US       1975    male  2012-10-06T03:14:07.149Z  Stratford  Ontario    -240.0
3  1872223848  en_US       1991  female  2012-11-04T08:59:43.783Z        Tehran  Iran     210.0
4  3429017717  id_ID       1995  female  2012-09-10T16:06:53.132Z                 NaN     420.0
5   627175141  ka_GE       1973  female  2012-11-01T09:59:17.590Z    Tbilisi  Georgia     240.0
6  2752000443  id_ID       1994    male  2012-10-03T05:22:17.637Z    Medan  Indonesia     420.0
7  3473687777  id_ID       1965  female  2012-10-03T12:19:29.975Z    Medan  Indonesia     420.0
8  2966052962  id_ID       1979    male  2012-10-31T10:11:57.668Z    Medan  Indonesia     420.0
9   264876277  id_ID       1988  female  2012-10-02T07:28:09.555Z    Medan  Indonesia     420.0
#Use the locale and pycountry modules to convert strings to numeric values
import locale
locale.locale_alias
{'a3': 'az_AZ.KOI8-C',
 'a3_az': 'az_AZ.KOI8-C',
 'a3_az.koic': 'az_AZ.KOI8-C',
 'aa_dj': 'aa_DJ.ISO8859-1',
 'aa_er': 'aa_ER.UTF-8',
 'aa_et': 'aa_ET.UTF-8',
 'af': 'af_ZA.ISO8859-1',
 'af_za': 'af_ZA.ISO8859-1',
 'am': 'am_ET.UTF-8',
 'am_et': 'am_ET.UTF-8',
 'american': 'en_US.ISO8859-1',
 'an_es': 'an_ES.ISO8859-15',
 'ar': 'ar_AA.ISO8859-6',
 'ar_aa': 'ar_AA.ISO8859-6',
 'ar_ae': 'ar_AE.ISO8859-6',
 'ar_bh': 'ar_BH.ISO8859-6',
 'ar_dz': 'ar_DZ.ISO8859-6',
 'ar_eg': 'ar_EG.ISO8859-6',
 'ar_in': 'ar_IN.UTF-8',
 'ar_iq': 'ar_IQ.ISO8859-6',
 'ar_jo': 'ar_JO.ISO8859-6',
 'ar_kw': 'ar_KW.ISO8859-6',
 'ar_lb': 'ar_LB.ISO8859-6',
 'ar_ly': 'ar_LY.ISO8859-6',
 'ar_ma': 'ar_MA.ISO8859-6',
 'ar_om': 'ar_OM.ISO8859-6',
 'ar_qa': 'ar_QA.ISO8859-6',
 'ar_sa': 'ar_SA.ISO8859-6',
 'ar_sd': 'ar_SD.ISO8859-6',
 'ar_sy': 'ar_SY.ISO8859-6',
 'ar_tn': 'ar_TN.ISO8859-6',
 'ar_ye': 'ar_YE.ISO8859-6',
 'arabic': 'ar_AA.ISO8859-6',
 'as': 'as_IN.UTF-8',
 'as_in': 'as_IN.UTF-8',
 'ast_es': 'ast_ES.ISO8859-15',
 'ayc_pe': 'ayc_PE.UTF-8',
 'az': 'az_AZ.ISO8859-9E',
 'az_az': 'az_AZ.ISO8859-9E',
 'az_az.iso88599e': 'az_AZ.ISO8859-9E',
 'be': 'be_BY.CP1251',
 'be@latin': 'be_BY.UTF-8@latin',
 'be_bg.utf8': 'bg_BG.UTF-8',
 'be_by': 'be_BY.CP1251',
 'be_by@latin': 'be_BY.UTF-8@latin',
 'bem_zm': 'bem_ZM.UTF-8',
 'ber_dz': 'ber_DZ.UTF-8',
 'ber_ma': 'ber_MA.UTF-8',
 'bg': 'bg_BG.CP1251',
 'bg_bg': 'bg_BG.CP1251',
 'bho_in': 'bho_IN.UTF-8',
 'bn_bd': 'bn_BD.UTF-8',
 'bn_in': 'bn_IN.UTF-8',
 'bo_cn': 'bo_CN.UTF-8',
 'bo_in': 'bo_IN.UTF-8',
 'bokmal': 'nb_NO.ISO8859-1',
 'bokmål': 'nb_NO.ISO8859-1',
 'br': 'br_FR.ISO8859-1',
 'br_fr': 'br_FR.ISO8859-1',
 'brx_in': 'brx_IN.UTF-8',
 'bs': 'bs_BA.ISO8859-2',
 'bs_ba': 'bs_BA.ISO8859-2',
 'bulgarian': 'bg_BG.CP1251',
 'byn_er': 'byn_ER.UTF-8',
 'c': 'C',
 'c-french': 'fr_CA.ISO8859-1',
 'c.ascii': 'C',
 'c.en': 'C',
 'c.iso88591': 'en_US.ISO8859-1',
 'c.utf8': 'en_US.UTF-8',
 'c_c': 'C',
 'c_c.c': 'C',
 'ca': 'ca_ES.ISO8859-1',
 'ca_ad': 'ca_AD.ISO8859-1',
 'ca_es': 'ca_ES.ISO8859-1',
 'ca_es@valencia': 'ca_ES.ISO8859-15@valencia',
 'ca_fr': 'ca_FR.ISO8859-1',
 'ca_it': 'ca_IT.ISO8859-1',
 'catalan': 'ca_ES.ISO8859-1',
 'cextend': 'en_US.ISO8859-1',
 'chinese-s': 'zh_CN.eucCN',
 'chinese-t': 'zh_TW.eucTW',
 'crh_ua': 'crh_UA.UTF-8',
 'croatian': 'hr_HR.ISO8859-2',
 'cs': 'cs_CZ.ISO8859-2',
 'cs_cs': 'cs_CZ.ISO8859-2',
 'cs_cz': 'cs_CZ.ISO8859-2',
 'csb_pl': 'csb_PL.UTF-8',
 'cv_ru': 'cv_RU.UTF-8',
 'cy': 'cy_GB.ISO8859-1',
 'cy_gb': 'cy_GB.ISO8859-1',
 'cz': 'cs_CZ.ISO8859-2',
 'cz_cz': 'cs_CZ.ISO8859-2',
 'czech': 'cs_CZ.ISO8859-2',
 'da': 'da_DK.ISO8859-1',
 'da_dk': 'da_DK.ISO8859-1',
 'danish': 'da_DK.ISO8859-1',
 'dansk': 'da_DK.ISO8859-1',
 'de': 'de_DE.ISO8859-1',
 'de_at': 'de_AT.ISO8859-1',
 'de_be': 'de_BE.ISO8859-1',
 'de_ch': 'de_CH.ISO8859-1',
 'de_de': 'de_DE.ISO8859-1',
 'de_li.utf8': 'de_LI.UTF-8',
 'de_lu': 'de_LU.ISO8859-1',
 'deutsch': 'de_DE.ISO8859-1',
 'doi_in': 'doi_IN.UTF-8',
 'dutch': 'nl_NL.ISO8859-1',
 'dutch.iso88591': 'nl_BE.ISO8859-1',
 'dv_mv': 'dv_MV.UTF-8',
 'dz_bt': 'dz_BT.UTF-8',
 'ee': 'ee_EE.ISO8859-4',
 'ee_ee': 'ee_EE.ISO8859-4',
 'eesti': 'et_EE.ISO8859-1',
 'el': 'el_GR.ISO8859-7',
 'el_cy': 'el_CY.ISO8859-7',
 'el_gr': 'el_GR.ISO8859-7',
 'el_gr@euro': 'el_GR.ISO8859-15',
 'en': 'en_US.ISO8859-1',
 'en_ag': 'en_AG.UTF-8',
 'en_au': 'en_AU.ISO8859-1',
 'en_be': 'en_BE.ISO8859-1',
 'en_bw': 'en_BW.ISO8859-1',
 'en_ca': 'en_CA.ISO8859-1',
 'en_dk': 'en_DK.ISO8859-1',
 'en_dl.utf8': 'en_DL.UTF-8',
 'en_gb': 'en_GB.ISO8859-1',
 'en_hk': 'en_HK.ISO8859-1',
 'en_ie': 'en_IE.ISO8859-1',
 'en_in': 'en_IN.ISO8859-1',
 'en_ng': 'en_NG.UTF-8',
 'en_nz': 'en_NZ.ISO8859-1',
 'en_ph': 'en_PH.ISO8859-1',
 'en_sg': 'en_SG.ISO8859-1',
 'en_uk': 'en_GB.ISO8859-1',
 'en_us': 'en_US.ISO8859-1',
 'en_us@euro@euro': 'en_US.ISO8859-15',
 'en_za': 'en_ZA.ISO8859-1',
 'en_zm': 'en_ZM.UTF-8',
 'en_zw': 'en_ZW.ISO8859-1',
 'en_zw.utf8': 'en_ZS.UTF-8',
 'eng_gb': 'en_GB.ISO8859-1',
 'english': 'en_EN.ISO8859-1',
 'english_uk': 'en_GB.ISO8859-1',
 'english_united-states': 'en_US.ISO8859-1',
 'english_united-states.437': 'C',
 'english_us': 'en_US.ISO8859-1',
 'eo': 'eo_XX.ISO8859-3',
 'eo.utf8': 'eo.UTF-8',
 'eo_eo': 'eo_EO.ISO8859-3',
 'eo_us.utf8': 'eo_US.UTF-8',
 'eo_xx': 'eo_XX.ISO8859-3',
 'es': 'es_ES.ISO8859-1',
 'es_ar': 'es_AR.ISO8859-1',
 'es_bo': 'es_BO.ISO8859-1',
 'es_cl': 'es_CL.ISO8859-1',
 'es_co': 'es_CO.ISO8859-1',
 'es_cr': 'es_CR.ISO8859-1',
 'es_cu': 'es_CU.UTF-8',
 'es_do': 'es_DO.ISO8859-1',
 'es_ec': 'es_EC.ISO8859-1',
 'es_es': 'es_ES.ISO8859-1',
 'es_gt': 'es_GT.ISO8859-1',
 'es_hn': 'es_HN.ISO8859-1',
 'es_mx': 'es_MX.ISO8859-1',
 'es_ni': 'es_NI.ISO8859-1',
 'es_pa': 'es_PA.ISO8859-1',
 'es_pe': 'es_PE.ISO8859-1',
 'es_pr': 'es_PR.ISO8859-1',
 'es_py': 'es_PY.ISO8859-1',
 'es_sv': 'es_SV.ISO8859-1',
 'es_us': 'es_US.ISO8859-1',
 'es_uy': 'es_UY.ISO8859-1',
 'es_ve': 'es_VE.ISO8859-1',
 'estonian': 'et_EE.ISO8859-1',
 'et': 'et_EE.ISO8859-15',
 'et_ee': 'et_EE.ISO8859-15',
 'eu': 'eu_ES.ISO8859-1',
 'eu_es': 'eu_ES.ISO8859-1',
 'eu_fr': 'eu_FR.ISO8859-1',
 'fa': 'fa_IR.UTF-8',
 'fa_ir': 'fa_IR.UTF-8',
 'fa_ir.isiri3342': 'fa_IR.ISIRI-3342',
 'ff_sn': 'ff_SN.UTF-8',
 'fi': 'fi_FI.ISO8859-15',
 'fi_fi': 'fi_FI.ISO8859-15',
 'fil_ph': 'fil_PH.UTF-8',
 'finnish': 'fi_FI.ISO8859-1',
 'fo': 'fo_FO.ISO8859-1',
 'fo_fo': 'fo_FO.ISO8859-1',
 'fr': 'fr_FR.ISO8859-1',
 'fr_be': 'fr_BE.ISO8859-1',
 'fr_ca': 'fr_CA.ISO8859-1',
 'fr_ch': 'fr_CH.ISO8859-1',
 'fr_fr': 'fr_FR.ISO8859-1',
 'fr_lu': 'fr_LU.ISO8859-1',
 'français': 'fr_FR.ISO8859-1',
 'fre_fr': 'fr_FR.ISO8859-1',
 'french': 'fr_FR.ISO8859-1',
 'french.iso88591': 'fr_CH.ISO8859-1',
 'french_france': 'fr_FR.ISO8859-1',
 'fur_it': 'fur_IT.UTF-8',
 'fy_de': 'fy_DE.UTF-8',
 'fy_nl': 'fy_NL.UTF-8',
 'ga': 'ga_IE.ISO8859-1',
 'ga_ie': 'ga_IE.ISO8859-1',
 'galego': 'gl_ES.ISO8859-1',
 'galician': 'gl_ES.ISO8859-1',
 'gd': 'gd_GB.ISO8859-1',
 'gd_gb': 'gd_GB.ISO8859-1',
 'ger_de': 'de_DE.ISO8859-1',
 'german': 'de_DE.ISO8859-1',
 'german.iso88591': 'de_CH.ISO8859-1',
 'german_germany': 'de_DE.ISO8859-1',
 'gez_er': 'gez_ER.UTF-8',
 'gez_et': 'gez_ET.UTF-8',
 'gl': 'gl_ES.ISO8859-1',
 'gl_es': 'gl_ES.ISO8859-1',
 'greek': 'el_GR.ISO8859-7',
 'gu_in': 'gu_IN.UTF-8',
 'gv': 'gv_GB.ISO8859-1',
 'gv_gb': 'gv_GB.ISO8859-1',
 'ha_ng': 'ha_NG.UTF-8',
 'he': 'he_IL.ISO8859-8',
 'he_il': 'he_IL.ISO8859-8',
 'hebrew': 'he_IL.ISO8859-8',
 'hi': 'hi_IN.ISCII-DEV',
 'hi_in': 'hi_IN.ISCII-DEV',
 'hi_in.isciidev': 'hi_IN.ISCII-DEV',
 'hne': 'hne_IN.UTF-8',
 'hne_in': 'hne_IN.UTF-8',
 'hr': 'hr_HR.ISO8859-2',
 'hr_hr': 'hr_HR.ISO8859-2',
 'hrvatski': 'hr_HR.ISO8859-2',
 'hsb_de': 'hsb_DE.ISO8859-2',
 'ht_ht': 'ht_HT.UTF-8',
 'hu': 'hu_HU.ISO8859-2',
 'hu_hu': 'hu_HU.ISO8859-2',
 'hungarian': 'hu_HU.ISO8859-2',
 'hy_am': 'hy_AM.UTF-8',
 'hy_am.armscii8': 'hy_AM.ARMSCII_8',
 'ia': 'ia.UTF-8',
 'ia_fr': 'ia_FR.UTF-8',
 'icelandic': 'is_IS.ISO8859-1',
 'id': 'id_ID.ISO8859-1',
 'id_id': 'id_ID.ISO8859-1',
 'ig_ng': 'ig_NG.UTF-8',
 'ik_ca': 'ik_CA.UTF-8',
 'in': 'id_ID.ISO8859-1',
 'in_id': 'id_ID.ISO8859-1',
 'is': 'is_IS.ISO8859-1',
 'is_is': 'is_IS.ISO8859-1',
 'iso-8859-1': 'en_US.ISO8859-1',
 'iso-8859-15': 'en_US.ISO8859-15',
 'iso8859-1': 'en_US.ISO8859-1',
 'iso8859-15': 'en_US.ISO8859-15',
 'iso_8859_1': 'en_US.ISO8859-1',
 'iso_8859_15': 'en_US.ISO8859-15',
 'it': 'it_IT.ISO8859-1',
 'it_ch': 'it_CH.ISO8859-1',
 'it_it': 'it_IT.ISO8859-1',
 'italian': 'it_IT.ISO8859-1',
 'iu': 'iu_CA.NUNACOM-8',
 'iu_ca': 'iu_CA.NUNACOM-8',
 'iu_ca.nunacom8': 'iu_CA.NUNACOM-8',
 'iw': 'he_IL.ISO8859-8',
 'iw_il': 'he_IL.ISO8859-8',
 'iw_il.utf8': 'iw_IL.UTF-8',
 'ja': 'ja_JP.eucJP',
 'ja_jp': 'ja_JP.eucJP',
 'ja_jp.euc': 'ja_JP.eucJP',
 'ja_jp.mscode': 'ja_JP.SJIS',
 'ja_jp.pck': 'ja_JP.SJIS',
 'japan': 'ja_JP.eucJP',
 'japanese': 'ja_JP.eucJP',
 'japanese-euc': 'ja_JP.eucJP',
 'japanese.euc': 'ja_JP.eucJP',
 'jp_jp': 'ja_JP.eucJP',
 'ka': 'ka_GE.GEORGIAN-ACADEMY',
 'ka_ge': 'ka_GE.GEORGIAN-ACADEMY',
 'ka_ge.georgianacademy': 'ka_GE.GEORGIAN-ACADEMY',
 'ka_ge.georgianps': 'ka_GE.GEORGIAN-PS',
 'ka_ge.georgianrs': 'ka_GE.GEORGIAN-ACADEMY',
 'kk_kz': 'kk_KZ.RK1048',
 'kl': 'kl_GL.ISO8859-1',
 'kl_gl': 'kl_GL.ISO8859-1',
 'km_kh': 'km_KH.UTF-8',
 'kn': 'kn_IN.UTF-8',
 'kn_in': 'kn_IN.UTF-8',
 'ko': 'ko_KR.eucKR',
 'ko_kr': 'ko_KR.eucKR',
 'ko_kr.euc': 'ko_KR.eucKR',
 'kok_in': 'kok_IN.UTF-8',
 'korean': 'ko_KR.eucKR',
 'korean.euc': 'ko_KR.eucKR',
 'ks': 'ks_IN.UTF-8',
 'ks_in': 'ks_IN.UTF-8',
 'ks_in@devanagari.utf8': 'ks_IN.UTF-8@devanagari',
 'ku_tr': 'ku_TR.ISO8859-9',
 'kw': 'kw_GB.ISO8859-1',
 'kw_gb': 'kw_GB.ISO8859-1',
 'ky': 'ky_KG.UTF-8',
 'ky_kg': 'ky_KG.UTF-8',
 'lb_lu': 'lb_LU.UTF-8',
 'lg_ug': 'lg_UG.ISO8859-10',
 'li_be': 'li_BE.UTF-8',
 'li_nl': 'li_NL.UTF-8',
 'lij_it': 'lij_IT.UTF-8',
 'lithuanian': 'lt_LT.ISO8859-13',
 'lo': 'lo_LA.MULELAO-1',
 'lo_la': 'lo_LA.MULELAO-1',
 'lo_la.cp1133': 'lo_LA.IBM-CP1133',
 'lo_la.ibmcp1133': 'lo_LA.IBM-CP1133',
 'lo_la.mulelao1': 'lo_LA.MULELAO-1',
 'lt': 'lt_LT.ISO8859-13',
 'lt_lt': 'lt_LT.ISO8859-13',
 'lv': 'lv_LV.ISO8859-13',
 'lv_lv': 'lv_LV.ISO8859-13',
 'mag_in': 'mag_IN.UTF-8',
 'mai': 'mai_IN.UTF-8',
 'mai_in': 'mai_IN.UTF-8',
 'mg_mg': 'mg_MG.ISO8859-15',
 'mhr_ru': 'mhr_RU.UTF-8',
 'mi': 'mi_NZ.ISO8859-1',
 'mi_nz': 'mi_NZ.ISO8859-1',
 'mk': 'mk_MK.ISO8859-5',
 'mk_mk': 'mk_MK.ISO8859-5',
 'ml': 'ml_IN.UTF-8',
 'ml_in': 'ml_IN.UTF-8',
 'mn_mn': 'mn_MN.UTF-8',
 'mni_in': 'mni_IN.UTF-8',
 'mr': 'mr_IN.UTF-8',
 'mr_in': 'mr_IN.UTF-8',
 'ms': 'ms_MY.ISO8859-1',
 'ms_my': 'ms_MY.ISO8859-1',
 'mt': 'mt_MT.ISO8859-3',
 'mt_mt': 'mt_MT.ISO8859-3',
 'my_mm': 'my_MM.UTF-8',
 'nan_tw@latin': 'nan_TW.UTF-8@latin',
 'nb': 'nb_NO.ISO8859-1',
 'nb_no': 'nb_NO.ISO8859-1',
 'nds_de': 'nds_DE.UTF-8',
 'nds_nl': 'nds_NL.UTF-8',
 'ne_np': 'ne_NP.UTF-8',
 'nhn_mx': 'nhn_MX.UTF-8',
 'niu_nu': 'niu_NU.UTF-8',
 'niu_nz': 'niu_NZ.UTF-8',
 'nl': 'nl_NL.ISO8859-1',
 'nl_aw': 'nl_AW.UTF-8',
 'nl_be': 'nl_BE.ISO8859-1',
 'nl_nl': 'nl_NL.ISO8859-1',
 'nn': 'nn_NO.ISO8859-1',
 'nn_no': 'nn_NO.ISO8859-1',
 'no': 'no_NO.ISO8859-1',
 'no@nynorsk': 'ny_NO.ISO8859-1',
 'no_no': 'no_NO.ISO8859-1',
 'no_no.iso88591@bokmal': 'no_NO.ISO8859-1',
 'no_no.iso88591@nynorsk': 'no_NO.ISO8859-1',
 'norwegian': 'no_NO.ISO8859-1',
 'nr': 'nr_ZA.ISO8859-1',
 'nr_za': 'nr_ZA.ISO8859-1',
 'nso': 'nso_ZA.ISO8859-15',
 'nso_za': 'nso_ZA.ISO8859-15',
 'ny': 'ny_NO.ISO8859-1',
 'ny_no': 'ny_NO.ISO8859-1',
 'nynorsk': 'nn_NO.ISO8859-1',
 'oc': 'oc_FR.ISO8859-1',
 'oc_fr': 'oc_FR.ISO8859-1',
 'om_et': 'om_ET.UTF-8',
 'om_ke': 'om_KE.ISO8859-1',
 'or': 'or_IN.UTF-8',
 'or_in': 'or_IN.UTF-8',
 'os_ru': 'os_RU.UTF-8',
 'pa': 'pa_IN.UTF-8',
 'pa_in': 'pa_IN.UTF-8',
 'pa_pk': 'pa_PK.UTF-8',
 'pap_an': 'pap_AN.UTF-8',
 'pd': 'pd_US.ISO8859-1',
 'pd_de': 'pd_DE.ISO8859-1',
 'pd_us': 'pd_US.ISO8859-1',
 'ph': 'ph_PH.ISO8859-1',
 'ph_ph': 'ph_PH.ISO8859-1',
 'pl': 'pl_PL.ISO8859-2',
 'pl_pl': 'pl_PL.ISO8859-2',
 'polish': 'pl_PL.ISO8859-2',
 'portuguese': 'pt_PT.ISO8859-1',
 'portuguese_brazil': 'pt_BR.ISO8859-1',
 'posix': 'C',
 'posix-utf2': 'C',
 'pp': 'pp_AN.ISO8859-1',
 'pp_an': 'pp_AN.ISO8859-1',
 'ps_af': 'ps_AF.UTF-8',
 'pt': 'pt_PT.ISO8859-1',
 'pt_br': 'pt_BR.ISO8859-1',
 'pt_pt': 'pt_PT.ISO8859-1',
 'ro': 'ro_RO.ISO8859-2',
 'ro_ro': 'ro_RO.ISO8859-2',
 'romanian': 'ro_RO.ISO8859-2',
 'ru': 'ru_RU.UTF-8',
 'ru_ru': 'ru_RU.UTF-8',
 'ru_ua': 'ru_UA.KOI8-U',
 'rumanian': 'ro_RO.ISO8859-2',
 'russian': 'ru_RU.ISO8859-5',
 'rw': 'rw_RW.ISO8859-1',
 'rw_rw': 'rw_RW.ISO8859-1',
 'sa_in': 'sa_IN.UTF-8',
 'sat_in': 'sat_IN.UTF-8',
 'sc_it': 'sc_IT.UTF-8',
 'sd': 'sd_IN.UTF-8',
 'sd_in': 'sd_IN.UTF-8',
 'sd_in@devanagari.utf8': 'sd_IN.UTF-8@devanagari',
 'sd_pk': 'sd_PK.UTF-8',
 'se_no': 'se_NO.UTF-8',
 'serbocroatian': 'sr_RS.UTF-8@latin',
 'sh': 'sr_RS.UTF-8@latin',
 'sh_ba.iso88592@bosnia': 'sr_CS.ISO8859-2',
 'sh_hr': 'sh_HR.ISO8859-2',
 'sh_hr.iso88592': 'hr_HR.ISO8859-2',
 'sh_sp': 'sr_CS.ISO8859-2',
 'sh_yu': 'sr_RS.UTF-8@latin',
 'shs_ca': 'shs_CA.UTF-8',
 'si': 'si_LK.UTF-8',
 'si_lk': 'si_LK.UTF-8',
 'sid_et': 'sid_ET.UTF-8',
 'sinhala': 'si_LK.UTF-8',
 'sk': 'sk_SK.ISO8859-2',
 'sk_sk': 'sk_SK.ISO8859-2',
 'sl': 'sl_SI.ISO8859-2',
 'sl_cs': 'sl_CS.ISO8859-2',
 'sl_si': 'sl_SI.ISO8859-2',
 'slovak': 'sk_SK.ISO8859-2',
 'slovene': 'sl_SI.ISO8859-2',
 'slovenian': 'sl_SI.ISO8859-2',
 'so_dj': 'so_DJ.ISO8859-1',
 'so_et': 'so_ET.UTF-8',
 'so_ke': 'so_KE.ISO8859-1',
 'so_so': 'so_SO.ISO8859-1',
 'sp': 'sr_CS.ISO8859-5',
 'sp_yu': 'sr_CS.ISO8859-5',
 'spanish': 'es_ES.ISO8859-1',
 'spanish_spain': 'es_ES.ISO8859-1',
 'sq': 'sq_AL.ISO8859-2',
 'sq_al': 'sq_AL.ISO8859-2',
 'sq_mk': 'sq_MK.UTF-8',
 'sr': 'sr_RS.UTF-8',
 'sr@cyrillic': 'sr_RS.UTF-8',
 'sr@latn': 'sr_CS.UTF-8@latin',
 'sr_cs': 'sr_CS.UTF-8',
 'sr_cs.iso88592@latn': 'sr_CS.ISO8859-2',
 'sr_cs@latn': 'sr_CS.UTF-8@latin',
 'sr_me': 'sr_ME.UTF-8',
 'sr_rs': 'sr_RS.UTF-8',
 'sr_rs@latn': 'sr_RS.UTF-8@latin',
 'sr_sp': 'sr_CS.ISO8859-2',
 'sr_yu': 'sr_RS.UTF-8@latin',
 'sr_yu.cp1251@cyrillic': 'sr_CS.CP1251',
 'sr_yu.iso88592': 'sr_CS.ISO8859-2',
 'sr_yu.iso88595': 'sr_CS.ISO8859-5',
 'sr_yu.iso88595@cyrillic': 'sr_CS.ISO8859-5',
 'sr_yu.microsoftcp1251@cyrillic': 'sr_CS.CP1251',
 'sr_yu.utf8': 'sr_RS.UTF-8',
 'sr_yu.utf8@cyrillic': 'sr_RS.UTF-8',
 'sr_yu@cyrillic': 'sr_RS.UTF-8',
 'ss': 'ss_ZA.ISO8859-1',
 'ss_za': 'ss_ZA.ISO8859-1',
 'st': 'st_ZA.ISO8859-1',
 'st_za': 'st_ZA.ISO8859-1',
 'sv': 'sv_SE.ISO8859-1',
 'sv_fi': 'sv_FI.ISO8859-1',
 'sv_se': 'sv_SE.ISO8859-1',
 'sw_ke': 'sw_KE.UTF-8',
 'sw_tz': 'sw_TZ.UTF-8',
 'swedish': 'sv_SE.ISO8859-1',
 'szl_pl': 'szl_PL.UTF-8',
 'ta': 'ta_IN.TSCII-0',
 'ta_in': 'ta_IN.TSCII-0',
 'ta_in.tscii': 'ta_IN.TSCII-0',
 'ta_in.tscii0': 'ta_IN.TSCII-0',
 'ta_lk': 'ta_LK.UTF-8',
 'te': 'te_IN.UTF-8',
 'te_in': 'te_IN.UTF-8',
 'tg': 'tg_TJ.KOI8-C',
 'tg_tj': 'tg_TJ.KOI8-C',
 'th': 'th_TH.ISO8859-11',
 'th_th': 'th_TH.ISO8859-11',
 'th_th.tactis': 'th_TH.TIS620',
 'th_th.tis620': 'th_TH.TIS620',
 'thai': 'th_TH.ISO8859-11',
 'ti_er': 'ti_ER.UTF-8',
 'ti_et': 'ti_ET.UTF-8',
 'tig_er': 'tig_ER.UTF-8',
 'tk_tm': 'tk_TM.UTF-8',
 'tl': 'tl_PH.ISO8859-1',
 'tl_ph': 'tl_PH.ISO8859-1',
 'tn': 'tn_ZA.ISO8859-15',
 'tn_za': 'tn_ZA.ISO8859-15',
 'tr': 'tr_TR.ISO8859-9',
 'tr_cy': 'tr_CY.ISO8859-9',
 'tr_tr': 'tr_TR.ISO8859-9',
 'ts': 'ts_ZA.ISO8859-1',
 'ts_za': 'ts_ZA.ISO8859-1',
 'tt': 'tt_RU.TATAR-CYR',
 'tt_ru': 'tt_RU.TATAR-CYR',
 'tt_ru.tatarcyr': 'tt_RU.TATAR-CYR',
 'tt_ru@iqtelif': 'tt_RU.UTF-8@iqtelif',
 'turkish': 'tr_TR.ISO8859-9',
 'ug_cn': 'ug_CN.UTF-8',
 'uk': 'uk_UA.KOI8-U',
 'uk_ua': 'uk_UA.KOI8-U',
 'univ': 'en_US.utf',
 'universal': 'en_US.utf',
 'universal.utf8@ucs4': 'en_US.UTF-8',
 'unm_us': 'unm_US.UTF-8',
 'ur': 'ur_PK.CP1256',
 'ur_in': 'ur_IN.UTF-8',
 'ur_pk': 'ur_PK.CP1256',
 'uz': 'uz_UZ.UTF-8',
 'uz_uz': 'uz_UZ.UTF-8',
 'uz_uz@cyrillic': 'uz_UZ.UTF-8',
 've': 've_ZA.UTF-8',
 've_za': 've_ZA.UTF-8',
 'vi': 'vi_VN.TCVN',
 'vi_vn': 'vi_VN.TCVN',
 'vi_vn.tcvn': 'vi_VN.TCVN',
 'vi_vn.tcvn5712': 'vi_VN.TCVN',
 'vi_vn.viscii': 'vi_VN.VISCII',
 'vi_vn.viscii111': 'vi_VN.VISCII',
 'wa': 'wa_BE.ISO8859-1',
 'wa_be': 'wa_BE.ISO8859-1',
 'wae_ch': 'wae_CH.UTF-8',
 'wal_et': 'wal_ET.UTF-8',
 'wo_sn': 'wo_SN.UTF-8',
 'xh': 'xh_ZA.ISO8859-1',
 'xh_za': 'xh_ZA.ISO8859-1',
 'yi': 'yi_US.CP1255',
 'yi_us': 'yi_US.CP1255',
 'yo_ng': 'yo_NG.UTF-8',
 'yue_hk': 'yue_HK.UTF-8',
 'zh': 'zh_CN.eucCN',
 'zh_cn': 'zh_CN.gb2312',
 'zh_cn.big5': 'zh_TW.big5',
 'zh_cn.euc': 'zh_CN.eucCN',
 'zh_hk': 'zh_HK.big5hkscs',
 'zh_hk.big5hk': 'zh_HK.big5hkscs',
 'zh_sg': 'zh_SG.GB2312',
 'zh_sg.gbk': 'zh_SG.GBK',
 'zh_tw': 'zh_TW.big5',
 'zh_tw.euc': 'zh_TW.eucTW',
 'zh_tw.euctw': 'zh_TW.eucTW',
 'zu': 'zu_ZA.ISO8859-1',
 'zu_za': 'zu_ZA.ISO8859-1'}

1. Processing the locale column

import locale
from collections import defaultdict
 
localeIdMap = defaultdict(int)
for i, l in enumerate(locale.locale_alias.keys()):
    localeIdMap[l] = i + 1
for each in localeIdMap:
    print(each, '\t', localeIdMap[each])
ee 	 1
fr_ch 	 2
fo_fo 	 3
af_za 	 4
bn_in 	 5
mni_in 	 93
da_dk 	 8
ar_ma 	 9
ig_ng 	 369
fr_be 	 11
italian 	 12
he_il 	 13
aa_dj 	 15
ml 	 463
yue_hk 	 17
pt_br 	 19
es_mx 	 280
gu_in 	 22
sid_et 	 23
it_it 	 24
japanese 	 95
de_de 	 26
en_ag 	 523
croatian 	 27
it 	 96
cs 	 29
mn_mn 	 30
ar_bh 	 31
ro_ro 	 481
gv_gb 	 33
rw 	 34
bg_bg 	 35
ar 	 499
en_us@euro@euro 	 36
fil_ph 	 37
fr_fr 	 466
french 	 39
de 	 40
polish 	 285
kok_in 	 42
korean.euc 	 43
sr 	 44
sr_cs.iso88592@latn 	 45
pap_an 	 46
sr_yu.iso88595 	 47
turkish 	 51
c.utf8 	 52
uz_uz 	 53
lv_lv 	 429
sr_rs@latn 	 54
eo_xx 	 55
ik_ca 	 57
iso_8859_1 	 58
no_no.iso88591@bokmal 	 59
cextend 	 60
doi_in 	 225
universal 	 61
es_cr 	 62
hne_in 	 63
gd_gb 	 64
cy 	 65
nl_aw 	 66
yi 	 67
mt_mt 	 68
sk_sk 	 384
si_lk 	 71
a3_az 	 72
lt 	 500
st_za 	 73
iw 	 74
te 	 318
en_nz 	 528
en_in 	 76
zh_tw.euc 	 77
ne_np 	 49
brx_in 	 286
no 	 80
az 	 81
german.iso88591 	 475
ky 	 32
he 	 85
kn_in 	 86
id_id 	 110
mai 	 88
nb_no 	 89
czech 	 90
sq 	 91
ja 	 92
tr 	 6
german_germany 	 94
shs_ca 	 265
mr 	 28
fi_fi 	 97
wal_et 	 48
cs_cs 	 100
sd_in@devanagari.utf8 	 101
gez_er 	 102
a3 	 103
wae_ch 	 283
iu 	 106
nl 	 107
french.iso88591 	 108
japanese-euc 	 83
tig_er 	 98
hne 	 111
c.iso88591 	 112
ar_qa 	 113
chinese-t 	 114
fo 	 115
de_li.utf8 	 117
br_fr 	 118
mag_in 	 515
sv_fi 	 119
russian 	 120
pp 	 121
wa_be 	 123
norwegian 	 124
fa_ir.isiri3342 	 126
ky_kg 	 127
zh_tw.euctw 	 128
fre_fr 	 130
english_uk 	 131
arabic 	 133
fr_ca 	 134
ber_ma 	 135
ml_in 	 136
li_nl 	 137
et 	 138
fur_it 	 139
om_ke 	 140
gl 	 141
bg 	 142
is_is 	 143
sr_yu 	 282
tk_tm 	 125
en_au 	 146
fa_ir 	 147
be_bg.utf8 	 148
zu 	 303
sh_hr.iso88592 	 150
szl_pl 	 310
ar_ae 	 152
nynorsk 	 153
en_bw 	 154
iso-8859-1 	 155
tl_ph 	 518
bulgarian 	 299
ts 	 356
kn 	 159
af 	 160
wa 	 161
or_in 	 162
dansk 	 163
bs 	 522
be@latin 	 164
lij_it 	 398
ko_kr 	 167
tr_tr 	 168
ar_in 	 169
os_ru 	 170
sr_yu@cyrillic 	 171
ta_lk 	 172
sr_rs 	 490
es_ec 	 174
en_be 	 175
no_no.iso88591@nynorsk 	 176
zh_cn.big5 	 177
pt_pt 	 178
an_es 	 179
zh_hk 	 180
es_cl 	 181
unm_us 	 312
am 	 183
as 	 184
cv_ru 	 185
ar_aa 	 186
gd 	 419
ti_er 	 187
ar_lb 	 188
sp 	 189
ja_jp.euc 	 190
csb_pl 	 191
el_gr 	 192
de_be 	 193
bokmål 	 194
danish 	 195
be_by@latin 	 196
kw 	 198
iso_8859_15 	 301
sr_yu.iso88595@cyrillic 	 199
cs_cz 	 200
tn 	 201
ar_tn 	 202
or 	 203
se_no 	 204
mhr_ru 	 495
be_by 	 206
eu_fr 	 406
de_at 	 207
tr_cy 	 104
mai_in 	 209
zu_za 	 210
sh_hr 	 211
ta_in.tscii 	 212
sr_yu.utf8 	 213
de_ch 	 214
dv_mv 	 236
mk 	 215
mt 	 216
fa 	 217
tt_ru 	 218
ga_ie 	 306
iw_il 	 219
li_be 	 220
ka_ge.georgianacademy 	 221
az_az.iso88599e 	 222
eng_gb 	 223
en_zw 	 224
en_dl.utf8 	 75
estonian 	 226
es_pa 	 227
sw_ke 	 228
es_pe 	 229
pa_pk 	 230
hebrew 	 231
niu_nu 	 232
lo_la 	 233
ca_es 	 309
sq_al 	 235
ka_ge.georgianrs 	 305
ca 	 238
tt_ru.tatarcyr 	 239
zh_hk.big5hk 	 240
nb 	 241
mg_mg 	 242
eo_eo 	 510
kl_gl 	 411
lo 	 244
iu_ca 	 245
thai 	 517
as_in 	 246
en_ng 	 313
ar_om 	 248
ia 	 249
eo_us.utf8 	 250
ur_pk 	 251
vi_vn.tcvn 	 252
ar_eg 	 253
es_py 	 254
ru_ua 	 255
nn 	 256
hr 	 504
chinese-s 	 258
sc_it 	 259
ta_in.tscii0 	 260
korean 	 261
nr_za 	 262
si 	 263
zh_sg 	 264
portuguese_brazil 	 440
bokmal 	 482
ber_dz 	 266
pa 	 316
ee_ee 	 526
american 	 268
en_za 	 269
lo_la.cp1133 	 270
pa_in 	 271
en_uk 	 272
sat_in 	 273
so_so 	 274
finnish 	 275
cy_gb 	 277
mi_nz 	 278
gez_et 	 279
german 	 20
am_et 	 281
ko_kr.euc 	 543
es_cu 	 144
sd 	 69
ti_et 	 156
en_ca 	 506
sr_yu.microsoftcp1251@cyrillic 	 87
c.ascii 	 402
lv 	 287
ka_ge.georgianps 	 288
pl_pl 	 237
ar_kw 	 290
hrvatski 	 7
bo_in 	 292
dutch.iso88591 	 293
pd_de 	 294
in_id 	 296
ms 	 297
hsb_de 	 298
sr_yu.utf8@cyrillic 	 157
th_th.tis620 	 300
lb_lu 	 315
lg_ug 	 302
uz_uz@cyrillic 	 304
sh_sp 	 314
tg_tj 	 129
ku_tr 	 307
deutsch 	 105
ar_ly 	 536
nds_nl 	 390
my_mm 	 308
fy_nl 	 234
aa_er 	 151
kw_gb 	 311
hy_am 	 247
romanian 	 267
wo_sn 	 122
so_ke 	 320
sr_yu.iso88592 	 322
pl 	 295
sp_yu 	 324
be 	 325
et_ee 	 326
en_ie 	 328
es_do 	 329
en_sg 	 330
it_ch 	 331
bs_ba 	 332
el_gr@euro 	 333
sinhala 	 334
hu 	 335
tt_ru@iqtelif 	 336
ger_de 	 337
iu_ca.nunacom8 	 78
ph_ph 	 339
en_ph 	 469
rw_rw 	 393
so_et 	 340
ka 	 341
ur_in 	 205
hr_hr 	 343
ar_sa 	 344
french_france 	 345
sk 	 346
es_pr 	 347
galician 	 349
ff_sn 	 350
sq_mk 	 56
ny_no 	 352
ro 	 353
zh_cn 	 354
tt 	 355
nhn_mx 	 427
en_dk 	 372
ar_iq 	 358
lt_lt 	 359
dutch 	 360
slovenian 	 361
cz 	 362
nso_za 	 508
cz_cz 	 428
ss 	 364
ar_sy 	 365
en_gb 	 366
byn_er 	 367
ayc_pe 	 368
en_zw.utf8 	 338
ug_cn 	 14
es_ni 	 371
catalan 	 84
english_us 	 373
hi_in.isciidev 	 374
eu_es 	 422
ca_fr 	 375
vi_vn.tcvn5712 	 376
so_dj 	 50
nl_nl 	 378
en_zm 	 379
posix-utf2 	 380
el 	 525
lo_la.ibmcp1133 	 382
en 	 383
th_th 	 70
ka_ge 	 385
kk_kz 	 386
a3_az.koic 	 387
fr 	 388
de_lu 	 389
zh 	 21
es_gt 	 542
oc_fr 	 391
ta 	 392
sv_se 	 116
st 	 10
galego 	 395
eu 	 158
sr_sp 	 529
sr_yu.cp1251@cyrillic 	 166
es_ar 	 400
mk_mk 	 401
english_united-states.437 	 18
dz_bt 	 351
ga 	 432
en_us 	 404
ar_jo 	 405
es_uy 	 342
tl 	 407
c-french 	 408
english_united-states 	 409
en_hk 	 410
br 	 478
nso 	 243
spanish_spain 	 412
xh 	 413
yi_us 	 414
ps_af 	 415
zh_tw 	 416
bho_in 	 417
ia_fr 	 435
ss_za 	 418
gv 	 291
es_bo 	 420
eo 	 491
gl_es 	 421
ja_jp 	 319
tn_za 	 423
crh_ua 	 424
sw_tz 	 425
jp_jp 	 426
sh_ba.iso88592@bosnia 	 357
km_kh 	 363
sv 	 399
no@nynorsk 	 16
vi 	 403
hy_am.armscii8 	 433
ru_ru 	 434
univ 	 276
mr_in 	 436
ur 	 437
ht_ht 	 438
japan 	 439
sh 	 377
fr_lu 	 441
es_hn 	 442
ast_es 	 443
ta_in 	 444
sd_pk 	 445
portuguese 	 446
ts_za 	 447
mi 	 448
lithuanian 	 488
c.en 	 450
zh_cn.euc 	 321
az_az 	 452
ko 	 537
sr@latn 	 454
es_us 	 455
ny 	 456
is 	 182
iso8859-1 	 431
fy_de 	 197
oc 	 459
icelandic 	 460
es_es 	 461
greek 	 462
pp_an 	 284
da 	 464
ha_ng 	 465
ks_in@devanagari.utf8 	 38
el_cy 	 512
pd_us 	 467
th 	 468
ja_jp.pck 	 149
ru 	 470
c 	 396
ca_es@valencia 	 458
uk 	 472
rumanian 	 473
français 	 474
ja_jp.mscode 	 82
tg 	 476
es_sv 	 477
japanese.euc 	 99
ca_it 	 479
c_c.c 	 25
english 	 480
es_ve 	 394
kl 	 483
ve 	 484
sr_cs@latn 	 485
ar_dz 	 486
aa_et 	 487
bo_cn 	 109
iw_il.utf8 	 145
nn_no 	 489
vi_vn 	 173
spanish 	 79
ca_ad 	 492
vi_vn.viscii111 	 494
c_c 	 451
nan_tw@latin 	 370
ar_sd 	 498
vi_vn.viscii 	 496
ms_my 	 501
es_co 	 502
posix 	 503
niu_nz 	 257
ks 	 505
id 	 430
iso-8859-15 	 507
sd_in 	 327
es 	 509
th_th.tactis 	 41
iso8859-15 	 471
bn_bd 	 511
hu_hu 	 323
nds_de 	 513
nr 	 514
slovene 	 208
sl_si 	 516
ve_za 	 317
sh_yu 	 545
sr@cyrillic 	 519
slovak 	 521
pd 	 497
serbocroatian 	 132
ph 	 457
sa_in 	 381
fi 	 348
nl_be 	 527
sr_me 	 165
swedish 	 397
sl_cs 	 530
ar_ye 	 524
yo_ng 	 531
eesti 	 532
hungarian 	 533
no_no 	 534
hi 	 548
uz 	 535
in 	 449
om_et 	 453
sr_cs 	 538
xh_za 	 539
pt 	 541
universal.utf8@ucs4 	 520
ks_in 	 493
bem_zm 	 544
hi_in 	 289
eo.utf8 	 546
uk_ua 	 547
zh_sg.gbk 	 540
te_in 	 549
sl 	 550
lo_la.mulelao1 	 551

So passing a locale string to localeIdMap converts it to a numeric value; if the string is not among localeIdMap's keys, 0 is returned, which is exactly the behavior defaultdict(int) provides.

print(localeIdMap['en_GB'.lower()])
print(localeIdMap['en_US'.lower()])
print(localeIdMap['id_ID'.lower()])
print(localeIdMap['ka_GE'.lower()])
366
404
110
385

2. Processing the birthyear column

This column is simple: an existing value is converted directly to an integer, and missing values are filled with 0.

def getBirthYearInt(birthYear):
    try:
        return 0 if birthYear=="None" else int(birthYear)
    except:
        return 0
print(getBirthYearInt(1992))
print(getBirthYearInt(None))
1992
0

3. Processing the gender column

male is mapped to 1, female to 2, and missing values are filled with 0

from collections import defaultdict
genderIdMap = defaultdict(int, {'male':1, 'female':2})
print(genderIdMap['male'])
print(genderIdMap['female'])
print(genderIdMap[None])
1
2
0

4. Processing the joinedAt column

The values in this column share a common pattern:

import pandas as pd
df_users = pd.read_csv('users.csv')
df_users['joinedAt'][:10]
0    2012-10-02T06:40:55.524Z
1    2012-09-29T18:03:12.111Z
2    2012-10-06T03:14:07.149Z
3    2012-11-04T08:59:43.783Z
4    2012-09-10T16:06:53.132Z
5    2012-11-01T09:59:17.590Z
6    2012-10-03T05:22:17.637Z
7    2012-10-03T12:19:29.975Z
8    2012-10-31T10:11:57.668Z
9    2012-10-02T07:28:09.555Z
Name: joinedAt, dtype: object

Each value is either None or a timestamp string like those above, always with a 'T' in the middle and a 'Z' at the end. Based on this pattern we use the datetime module to extract the year and month (note that the month is not zero-padded, so September 2012 becomes '20129' rather than '201209'):

import datetime
def getJoinedYearMonth(dateString):
    try:
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "".join( [str(dttm.year), str(dttm.month)] )
    except:
        return 0
df_users['joinedAt'].map(getJoinedYearMonth)[:10]
0    201210
1     20129
2    201210
3    201211
4     20129
5    201211
6    201210
7    201210
8    201210
9    201210
Name: joinedAt, dtype: object

5. Processing the location column

Let's look at the location column in users.csv (the first 20 rows):

df_users['location'][:20]
0                  Medan  Indonesia
1                  Medan  Indonesia
2                Stratford  Ontario
3                      Tehran  Iran
4                               NaN
5                  Tbilisi  Georgia
6                  Medan  Indonesia
7                  Medan  Indonesia
8                  Medan  Indonesia
9                  Medan  Indonesia
10                 Medan  Indonesia
11                       Phnom Penh
12    Djokja  Yogyakarta  Indonesia
13               Triolet  Mauritius
14                              NaN
15                              NaN
16                              NaN
17              Surabaya  Indonesia
18                 Medan  Indonesia
19                              NaN
Name: location, dtype: object

We use the pycountry module to convert this column to numeric form; pycountry.countries is an iterable:

import pycountry
from collections import defaultdict
countryIdMap = defaultdict(int)
for i, c in enumerate(pycountry.countries):
    countryIdMap[c.name.lower()] = i + 1
#Convert the location string to a numeric value
def getCountryId(location):
    if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind('  ') > -1:
        return countryIdMap[ location[location.rindex('  ') + 2: ].lower() ]
    else:
        return 0
print(getCountryId('San Dimas  California'))
print(getCountryId('Jogjakarta  Indonesia'))
0
103

Many machine learning models accept only numeric input, so the location information must be converted to numbers. The usual approach is one-hot encoding, but that would make the matrix very sparse. Instead we can use the pycountry library to encode each location according to the country list pycountry ships with, replacing the raw location string with its code.
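Here is a minimal sketch (toy values of my own) contrasting the two encodings; it reuses the countryIdMap built above:

import pandas as pd
locations = pd.Series(['Indonesia', 'Canada', 'Indonesia', None])
#One-hot: one column per distinct country, mostly zeros
print(pd.get_dummies(locations))
#Integer codes via countryIdMap: a single dense column, unknown or missing values map to 0
print(locations.fillna('').map(lambda s: countryIdMap[s.lower()]))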

6. Processing the timezone column

Also simple: an existing value is converted to int, and missing values are filled with 0

def getTimezoneInt(timezone):
    try:
        return int(timezone)
    except:
        return 0
print(getTimezoneInt(-240))#-240
print(getTimezoneInt(240))
print(getTimezoneInt(None))
-240
240
0

7. Normalize columns 1-6 processed above

When building self.userMatrix, normalization uses sklearn.preprocessing.normalize(); after normalizing, it is convenient to compute the similarity between two users.

We only compute similarities for the uniqueUserPairs from Step 1 of this walkthrough; those users are linked because they are associated with the same event.

Similarity uses scipy.spatial.distance.correlation(u, v), which computes the correlation distance between vectors u and v (1 minus the Pearson correlation coefficient, a centered cosine).
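Here is a minimal sketch (toy vectors of my own) of these two building blocks, normalize() and ssd.correlation():

import numpy as np
import scipy.spatial.distance as ssd
from sklearn.preprocessing import normalize
X = np.array([[1.0, 2.0, 3.0],
              [2.0, 4.0, 7.0]])
#L1-normalize each column, as done for userMatrix (axis=0)
print(normalize(X, norm='l1', axis=0))
#Correlation distance = 1 - Pearson correlation; 0 means perfectly correlated
print(ssd.correlation(X[0], X[1]))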

#Full code for Step 2
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
 
import scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalize

#Class that builds the user-event matrix
class ProgramEntities:
    """
    We only care about the users and events that appear in train and test, so we focus on that associated data.
    By count, train and test contain 3391 users and 13418 events in total.
    """
    def __init__(self):
        #Count the distinct users and events in the data
        uniqueUsers = set()#uniqueUsers holds every user: 3391 in total
        uniqueEvents = set()#uniqueEvents holds every event: 13418 in total
        eventsForUser = defaultdict(set)#maps each user to the set of events that user acted on
        usersForEvent = defaultdict(set)#maps each event to the set of users who acted on it
        for filename in ['train.csv', 'test.csv']:
            f = open(filename)
            f.readline()#skip the header line
            for line in f:
                cols = line.strip().split(',')
                uniqueUsers.add( cols[0] )
                uniqueEvents.add( cols[1] )
                eventsForUser[cols[0]].add( cols[1] )
                usersForEvent[cols[1]].add( cols[0] )
            f.close()
         
        self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
        self.userIndex = dict()
        self.eventIndex = dict()
        for i, u in enumerate(uniqueUsers):
            self.userIndex[u] = i
        for i, e in enumerate(uniqueEvents):
            self.eventIndex[e] = i
             
        ftrain = open('train.csv')
        ftrain.readline()
        for line in ftrain:
            cols = line.strip().split(',')
            i = self.userIndex[ cols[0] ]
            j = self.eventIndex[ cols[1] ]
            self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
        ftrain.close()
        sio.mmwrite('PE_userEventScores', self.userEventScores)
         
        #To avoid unnecessary computation we find all associated users and associated events
        #An associated user pair is two users who both acted on at least one common event
        #An associated event pair is two events that at least one common user acted on
        self.uniqueUserPairs = set()
        self.uniqueEventPairs = set()
        for event in uniqueEvents:
            users = usersForEvent[event]
            if len(users) > 2:
                self.uniqueUserPairs.update( itertools.combinations(users, 2) )
        for user in uniqueUsers:
            events = eventsForUser[user]
            if len(events) > 2:
                self.uniqueEventPairs.update( itertools.combinations(events, 2) )
        #print(self.userIndex)
        cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
        cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
         
#Data-cleaning class
class DataCleaner:
    def __init__(self):
        #Helpers for converting strings to numeric values
        #Load locales
        self.localeIdMap = defaultdict(int)
         
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1
             
        #Load countries
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            if c.name.lower() == 'usa':
                ctryIdx['US'] = i
            if c.name.lower() == 'canada':
                ctryIdx['CA'] = i
             
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
        #Gender mapping
        self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
                 
    #locale -> id
    def getLocaleId(self, locstr):
        #Because localeIdMap is a defaultdict(int), an unknown locstr.lower() returns the default int 0
        return self.localeIdMap[ locstr.lower() ]
         
    #birthyear
    def getBirthYearInt(self, birthYear):
        try:
            return 0 if birthYear == 'None' else int(birthYear)
        except:
            return 0
             
    #gender
    def getGenderId(self, genderStr):
        return self.genderIdMap[genderStr]
         
    #joinedAt
    def getJoinedYearMonth(self, dateString):
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "".join( [str(dttm.year), str(dttm.month) ] )
         
    #location
    def getCountryId(self, location):
        if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind('  ') > -1:
            return self.countryIdMap[ location[location.rindex('  ') + 2: ].lower() ]
        else:
            return 0
                     
    #timezone
    def getTimezoneInt(self, timezone):
        try:
            return int(timezone)
        except:
            return 0
        
#Class that builds the user-user similarity matrix
class Users:
    """
    Build the user/user similarity matrix
    """
    def __init__(self, programEntities, sim=ssd.correlation):#ssd.correlation(u, v) computes the correlation distance between u and v
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())#3391
        #print(nusers)
        fin = open('users.csv')
        colnames = fin.readline().strip().split(',') #7 feature columns
        self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )#build the sparse matrix
        for line in fin:
            cols = line.strip().split(',')
            #'Only consider users that appear in train.csv' comes from the original author's comment,
            #but userIndex contains all users from train and test, so I don't quite see why it says train.csv only
            #Build the user matrix from the cleaned raw values
            if cols[0] in programEntities.userIndex:
                i = programEntities.userIndex[ cols[0] ]#index for this user
                self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )#locale
                self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )#birthyear, missing filled with 0
                self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )#gender
                self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )#joinedAt
                self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )#location
                self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )#timezone
        fin.close()
         
        #Normalize the matrix
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)
         
        #Compute the user similarity matrix; it is used later
        self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )#(3391,3391)
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0
         
        for u1, u2 in programEntities.uniqueUserPairs:
            i = programEntities.userIndex[u1]  #index of user u1
            j = programEntities.userIndex[u2]
            if (i, j) not in self.userSimMatrix:
                #print(self.userMatrix.getrow(i).todense()) e.g. [[0.00028123,0.00029847,0.00043592,0.00035208,0,0.00032346]]
                #print(self.userMatrix.getrow(j).todense()) e.g. [[0.00028123,0.00029742,0.00043592,0.00035208,0,-0.00032346]]
                usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())  #similarity of the two user vectors; the matrix is symmetric
                self.userSimMatrix[i, j] = usim
                self.userSimMatrix[j, i] = usim
        sio.mmwrite('US_userSimMatrix', self.userSimMatrix) 
         
print('Step 1: collecting user and event statistics...')
pe = ProgramEntities()
print('Step 1 done...\n')
 
print('Step 2: computing user similarities and storing them as a matrix...')
Users(pe)
print('Step 2 done...\n')
Step 1: collecting user and event statistics...
Step 1 done...

Step 2: computing user similarities and storing them as a matrix...
Step 2 done...


Step 3: Process users' social relationships

This step needs user_friends.csv.gz; let's look at its contents first:

import pandas as pd
df_user_friends = pd.read_csv('user_friends.csv.gz', compression='gzip')
df_user_friends.head()
         user                                            friends
0  3197468391  1346449342 3873244116 4226080662 1222907620 54...
1  3537982273  1491560444 395798035 2036380346 899375619 3534...
2   823183725  1484954627 1950387873 1652977611 4185960823 42...
3  1872223848  83361640 723814682 557944478 1724049724 253059...
4  3429017717  4253303705 2130310957 1838389374 3928735761 71...
  • 1) If you have more friends, you are probably more outgoing and more likely to attend all kinds of events
  • 2) If your friends attend an event, you may well go along with them
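Before the full code, here is a minimal sketch (toy data, not the UserFriends class below) of the two friend-based features this step derives, a per-user friend count and the friend's average event score taken from userEventScores:

import numpy as np
import scipy.sparse as ss
toyScores = ss.dok_matrix( (2, 3) )#2 users x 3 events
toyScores[1, 0] = 1 #user 1 (the friend) liked event 0
toyScores[1, 2] = -1#...and disliked event 2
numFriends = np.zeros(2)
numFriends[0] = 1#user 0 has one friend: user 1
#the friend's average score over all events, as computed in UserFriends below
row = toyScores.getrow(1).todense()
print( row.sum() / np.shape(row)[1] )#0.0 here: +1 and -1 cancel out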
# Full code for Step 3
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
 
import scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalize
 
import gzip
import numpy as np
 
#Process the user-event association data
class ProgramEntities:
    """
    We only care about the users and events that appear in train and test, so we focus on that associated data.
    By count, train and test contain 3391 users and 13418 events in total.
    """
    def __init__(self):
        #Count the distinct users and events in the data
        uniqueUsers = set()#uniqueUsers holds every user: 3391 in total
        uniqueEvents = set()#uniqueEvents holds every event: 13418 in total
        eventsForUser = defaultdict(set)#maps each user to the set of events that user acted on
        usersForEvent = defaultdict(set)#maps each event to the set of users who acted on it
        for filename in ['train.csv', 'test.csv']:
            f = open(filename)
            f.readline()#skip the header line
            for line in f:
                cols = line.strip().split(',')
                uniqueUsers.add( cols[0] )
                uniqueEvents.add( cols[1] )
                eventsForUser[cols[0]].add( cols[1] )
                usersForEvent[cols[1]].add( cols[0] )
            f.close()
         
        self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
        self.userIndex = dict()
        self.eventIndex = dict()
        for i, u in enumerate(uniqueUsers):
            self.userIndex[u] = i
        for i, e in enumerate(uniqueEvents):
            self.eventIndex[e] = i
             
        ftrain = open('train.csv')
        ftrain.readline()
        for line in ftrain:
            cols = line.strip().split(',')
            i = self.userIndex[ cols[0] ]
            j = self.eventIndex[ cols[1] ]
            self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
        ftrain.close()
        sio.mmwrite('PE_userEventScores', self.userEventScores)
         
        #To avoid unnecessary computation we find all associated users and associated events
        #An associated user pair is two users who both acted on at least one common event
        #An associated event pair is two events that at least one common user acted on
        self.uniqueUserPairs = set()
        self.uniqueEventPairs = set()
        for event in uniqueEvents:
            users = usersForEvent[event]
            if len(users) > 2:
                self.uniqueUserPairs.update( itertools.combinations(users, 2) )
        for user in uniqueUsers:
            events = eventsForUser[user]
            if len(events) > 2:
                self.uniqueEventPairs.update( itertools.combinations(events, 2) )
        #print(self.userIndex)
        cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
        cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
         
 
#Data-cleaning class
class DataCleaner:
    def __init__(self):
        #Helpers for converting strings to numeric values
        #Load locales
        self.localeIdMap = defaultdict(int)
         
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1
             
        #Load countries
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            if c.name.lower() == 'usa':
                ctryIdx['US'] = i
            if c.name.lower() == 'canada':
                ctryIdx['CA'] = i
             
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
                 
        self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
                 
    #locale -> id
    def getLocaleId(self, locstr):
        #Because localeIdMap is a defaultdict(int), an unknown locstr.lower() returns the default int 0
        return self.localeIdMap[ locstr.lower() ]
         
    #birthyear
    def getBirthYearInt(self, birthYear):
        try:
            return 0 if birthYear == 'None' else int(birthYear)
        except:
            return 0
             
    #gender
    def getGenderId(self, genderStr):
        return self.genderIdMap[genderStr]
         
    #joinedAt
    def getJoinedYearMonth(self, dateString):
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "".join( [str(dttm.year), str(dttm.month) ] )
         
    #location
    def getCountryId(self, location):
        if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind('  ') > -1:
            return self.countryIdMap[ location[location.rindex('  ') + 2: ].lower() ]
        else:
            return 0
                     
    #timezone
    def getTimezoneInt(self, timezone):
        try:
            return int(timezone)
        except:
            return 0
 
#User-user similarity matrix
class Users:
    """
    Build the user/user similarity matrix
    """
    def __init__(self, programEntities, sim=ssd.correlation):#ssd.correlation(u, v) computes the correlation distance between u and v
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())#3391
        #print(nusers)
        fin = open('users.csv')
        colnames = fin.readline().strip().split(',') #7 feature columns
        self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )#build the sparse matrix
        for line in fin:
            cols = line.strip().split(',')
            #'Only consider users that appear in train.csv' comes from the original author's comment,
            #but userIndex contains all users from train and test, so I don't quite see why it says train.csv only
            if cols[0] in programEntities.userIndex:
                i = programEntities.userIndex[ cols[0] ]#index for this user
                self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )#locale
                self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )#birthyear, missing filled with 0
                self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )#gender
                self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )#joinedAt
                self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )#location
                self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )#timezone
        fin.close()
         
        #Normalize the matrix
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)
         
        #Compute the user similarity matrix; it is used later
        self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )#(3391,3391)
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0
         
        for u1, u2 in programEntities.uniqueUserPairs:
            i = programEntities.userIndex[u1]
            j = programEntities.userIndex[u2]
            if (i, j) not in self.userSimMatrix:
                #print(self.userMatrix.getrow(i).todense()) e.g. [[0.00028123,0.00029847,0.00043592,0.00035208,0,0.00032346]]
                #print(self.userMatrix.getrow(j).todense()) e.g. [[0.00028123,0.00029742,0.00043592,0.00035208,0,-0.00032346]]
                usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())
                self.userSimMatrix[i, j] = usim
                self.userSimMatrix[j, i] = usim
        sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
 
 
#Mine users' social relationships
class UserFriends:
    """
    Find each user's friends. The idea is simple:
    1) if you have more friends, you are probably more outgoing and more likely to attend events
    2) if your friends attend an event, you may well go along
    """
    def __init__(self, programEntities):
        nusers = len(programEntities.userIndex.keys())#3391 users
        
        self.numFriends = np.zeros( (nusers) )#array([0., 0., 0., ..., 0., 0., 0.]); stores each user's friend count
        self.userFriends = ss.dok_matrix( (nusers, nusers) )  #stores, for each (user, friend) pair, the friend's average event score
        
        fin = gzip.open('user_friends.csv.gz')
        print( 'Header In User_friends.csv.gz:',fin.readline() )
        ln = 0
        #Read user_friends.csv.gz line by line
        #Check whether the user in the first column is in userIndex; only those users matter to us
        #Get that user's index and friend count
        #For each friend of the user, if the friend is also in userIndex, get the friend's index and look up in userEventScores that friend's reaction to every event
        #score is the friend's average score over all events
        #The userFriends matrix records these scores between users and their friends
        #e.g. user 851286067 (index 1750) appears in test.csv and has 2151 friends in user_friends.csv.gz,
        #so its friend share is 2151 / sumNumFriends = 2151 / 3731377.0 = 0.0005764627910822198
        for line in fin:
            if ln % 200 == 0:
                print( 'Loading line:', ln )
            cols = line.decode().strip().split(',')
            user = cols[0]
            if user in programEntities.userIndex:
                friends = cols[1].split(' ')#list of this user's friends
                i = programEntities.userIndex[user]
                self.numFriends[i] = len(friends)
                for friend in friends:
                    if friend in programEntities.userIndex:
                        j = programEntities.userIndex[friend]
                        #the objective of this score is to infer the degree to
                        #and direction in which this friend will influence the
                        #user's decision, so we sum the user/event score for
                        #this user across all training events
                        eventsForUser = programEntities.userEventScores.getrow(j).todense()#the friend's reaction to each event: 0, 1, or -1
                        #print(eventsForUser.sum(), np.shape(eventsForUser)[1] )
                        #score is the friend's average score over the 13418 events
                        score = eventsForUser.sum() / np.shape(eventsForUser)[1]#np.shape(eventsForUser)[1] = 13418
                        #print(score)
                        self.userFriends[i, j] += score
                        self.userFriends[j, i] += score
            ln += 1
        fin.close()
        #Normalize
        sumNumFriends = self.numFriends.sum(axis=0)#total friend count over all users
        print(sumNumFriends)
        self.numFriends = self.numFriends / sumNumFriends#each user's share of the total friend count
        sio.mmwrite('UF_numFriends', np.matrix(self.numFriends) )  #save the user friend-count vector
        self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
        sio.mmwrite('UF_userFriends', self.userFriends)  #save the user-friend score matrix
         
print('Step 1: collecting user and event statistics...')
pe = ProgramEntities()
print('Step 1 done...\n')
 
print('Step 2: computing user similarities and storing them as a matrix...')
#Users(pe)
print('Step 2 done...\n')
 
print('Step 3: computing user social-relationship information and saving it...')
UserFriends(pe)
print('Step 3 done...\n')
Step 1: collecting user and event statistics...
Step 1 done...

Step 2: computing user similarities and storing them as a matrix...
Step 2 done...

Step 3: computing user social-relationship information and saving it...
Header In User_friends.csv.gz: b'user,friends\n'
Loading line: 0
Loading line: 200
Loading line: 400
Loading line: 600
Loading line: 800
Loading line: 1000
Loading line: 1200
Loading line: 1400
Loading line: 1600
Loading line: 1800
Loading line: 2000
Loading line: 2200
Loading line: 2400
Loading line: 2600
Loading line: 2800
Loading line: 3000
Loading line: 3200
Loading line: 3400
Loading line: 3600
Loading line: 3800
Loading line: 4000
Loading line: 4200
Loading line: 4400
Loading line: 4600
Loading line: 4800
Loading line: 5000
Loading line: 5200
Loading line: 5400
Loading line: 5600
Loading line: 5800
Loading line: 6000
Loading line: 6200
Loading line: 6400
Loading line: 6600
Loading line: 6800
Loading line: 7000
Loading line: 7200
Loading line: 7400
Loading line: 7600
Loading line: 7800
Loading line: 8000
Loading line: 8200
Loading line: 8400
Loading line: 8600
Loading line: 8800
Loading line: 9000
Loading line: 9200
Loading line: 9400
Loading line: 9600
Loading line: 9800
Loading line: 10000
Loading line: 10200
Loading line: 10400
Loading line: 10600
Loading line: 10800
Loading line: 11000
Loading line: 11200
Loading line: 11400
Loading line: 11600
Loading line: 11800
Loading line: 12000
Loading line: 12200
Loading line: 12400
Loading line: 12600
Loading line: 12800
Loading line: 13000
Loading line: 13200
Loading line: 13400
Loading line: 13600
Loading line: 13800
Loading line: 14000
Loading line: 14200
Loading line: 14400
Loading line: 14600
Loading line: 14800
Loading line: 15000
Loading line: 15200
Loading line: 15400
Loading line: 15600
Loading line: 15800
Loading line: 16000
Loading line: 16200
Loading line: 16400
Loading line: 16600
Loading line: 16800
Loading line: 17000
Loading line: 17200
Loading line: 17400
Loading line: 17600
Loading line: 17800
Loading line: 18000
Loading line: 18200
Loading line: 18400
Loading line: 18600
Loading line: 18800
Loading line: 19000
Loading line: 19200
Loading line: 19400
Loading line: 19600
Loading line: 19800
Loading line: 20000
Loading line: 20200
Loading line: 20400
Loading line: 20600
Loading line: 20800
Loading line: 21000
Loading line: 21200
Loading line: 21400
Loading line: 21600
Loading line: 21800
Loading line: 22000
Loading line: 22200
Loading line: 22400
Loading line: 22600
Loading line: 22800
Loading line: 23000
Loading line: 23200
Loading line: 23400
Loading line: 23600
Loading line: 23800
Loading line: 24000
Loading line: 24200
Loading line: 24400
Loading line: 24600
Loading line: 24800
Loading line: 25000
Loading line: 25200
Loading line: 25400
Loading line: 25600
Loading line: 25800
Loading line: 26000
Loading line: 26200
Loading line: 26400
Loading line: 26600
Loading line: 26800
Loading line: 27000
Loading line: 27200
Loading line: 27400
Loading line: 27600
Loading line: 27800
Loading line: 28000
Loading line: 28200
Loading line: 28400
Loading line: 28600
Loading line: 28800
Loading line: 29000
Loading line: 29200
Loading line: 29400
Loading line: 29600
Loading line: 29800
Loading line: 30000
Loading line: 30200
Loading line: 30400
Loading line: 30600
Loading line: 30800
Loading line: 31000
Loading line: 31200
Loading line: 31400
Loading line: 31600
Loading line: 31800
Loading line: 32000
Loading line: 32200
Loading line: 32400
Loading line: 32600
Loading line: 32800
Loading line: 33000
Loading line: 33200
Loading line: 33400
Loading line: 33600
Loading line: 33800
Loading line: 34000
Loading line: 34200
Loading line: 34400
Loading line: 34600
Loading line: 34800
Loading line: 35000
Loading line: 35200
Loading line: 35400
Loading line: 35600
Loading line: 35800
Loading line: 36000
Loading line: 36200
Loading line: 36400
Loading line: 36600
Loading line: 36800
Loading line: 37000
Loading line: 37200
Loading line: 37400
Loading line: 37600
Loading line: 37800
Loading line: 38000
Loading line: 38200
3731377.0
Step 3 done...
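As a sanity check (my own addition), the saved friend-count vector can be read back to confirm the proportions sum to 1:

import scipy.io as sio
numFriends = sio.mmread('UF_numFriends')
print(numFriends.sum())#~1.0, since numFriends was divided by sumNumFriends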

Step 4: Build event-event similarity data

Let's first look at events.csv.gz:

import pandas as pd
df_events_csv = pd.read_csv('events.csv.gz', compression='gzip')
df_events_csv.head()
     event_id     user_id                start_time city state  zip country  lat  lng c_1 ... c_92 c_93 c_94 c_95 c_96 c_97 c_98 c_99 c_100 c_other
0   684921758  3647864012  2012-10-31T00:00:00.001Z  NaN   NaN  NaN     NaN  NaN  NaN   2 ...    0    1    0    0    0    0    0    0     0       9
1   244999119  3476440521  2012-11-03T00:00:00.001Z  NaN   NaN  NaN     NaN  NaN  NaN   2 ...    0    0    0    0    0    0    0    0     0       7
2  3928440935   517514445  2012-11-05T00:00:00.001Z  NaN   NaN  NaN     NaN  NaN  NaN   0 ...    0    0    0    0    0    0    0    0     0      12
3  2582345152   781585781  2012-10-30T00:00:00.001Z  NaN   NaN  NaN     NaN  NaN  NaN   1 ...    0    0    0    0    0    0    0    0     0       8
4  1051165850  1016098580  2012-09-27T00:00:00.001Z  NaN   NaN  NaN     NaN  NaN  NaN   1 ...    0    0    0    0    0    0    0    0     0       9

5 rows × 110 columns

The information above is converted to numeric values as follows:
1. The start_time column is parsed with the datetime library (a sketch follows below).
2. The city, state, zip and country columns are hashed with the hashlib package: the feature value is the first four hex digits of the SHA-224 digest (an integer in 0-65535), with -1 for empty strings. Note that only events that appear in train.csv or test.csv go through this numeric conversion.
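For item 1, the conversion reduces the start_time string to a concatenated year+month value. A minimal standalone sketch of that conversion (mirroring the getJoinedYearMonth method used in the full code below):

import datetime

def get_joined_year_month(date_string):
    # parse an ISO-like timestamp such as '2012-10-31T00:00:00.001Z'
    dttm = datetime.datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S.%fZ")
    # concatenate year and month, e.g. '201210'
    # (note: months below 10 are not zero-padded, so January 2012 becomes '20121')
    return "".join([str(dttm.year), str(dttm.month)])

print(get_joined_year_month('2012-10-31T00:00:00.001Z'))  # 201210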
import hashlib
def FeatureHash(value):
    if len(value.strip()) == 0:
        return -1
    else:
        return int( hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4], 16)

print(FeatureHash('Muaraenim'))#47294
print(FeatureHash('a test demo'))#4030
47294
4030
3. Handling the lat and lng columns

Empty values are filled with 0.0; everything else is converted to its float value:

def getFloatValue(self, value):
    if len(value.strip()) == 0:
        return 0.0
    else:
        return float(value)
4. Handling the columns from c_1 onward (the 10th column and beyond)
  • A matrix eventContMatrix stores the c_1 through c_100 counts; the unused c_other column is dropped.
5. The eventPropMatrix and eventContMatrix matrices are normalized and then saved to files.
6. Event-pair similarities are computed over uniqueEventPairs
  • using the correlation and cosine methods from scipy.spatial.distance (see the sketch below).
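For items 5 and 6, here is a toy illustration of what L1 normalization and the two distance functions do (the array values are made up for illustration only):

import numpy as np
from sklearn.preprocessing import normalize
import scipy.spatial.distance as ssd

# two toy "event" feature rows (illustrative values only)
m = np.array([[1.0, 2.0, 3.0],
              [2.0, 0.0, 6.0]])

# norm='l1', axis=0 rescales each column so its absolute values sum to 1,
# which is exactly how eventPropMatrix/eventContMatrix are normalized below
m_norm = normalize(m, norm='l1', axis=0)
print(m_norm)

# ssd.correlation and ssd.cosine are distances (1 - similarity),
# so 0 means perfectly similar
print(ssd.cosine(m_norm[0], m_norm[1]))
print(ssd.correlation(m_norm[0], m_norm[1]))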
## Step 4: complete code
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
 
import scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalize
 
import gzip
import numpy as np
 
import hashlib
 
#Process the user-event association data
class ProgramEntities:
    """
    We only care about the users and events that appear in train and test,
    so we focus on this associated data.
    Counting them gives 3391 users and 13418 events in total.
    """
    def __init__(self):
        #count the unique users and events in the training and test sets
        uniqueUsers = set()#uniqueUsers holds all users: 3391 in total
        uniqueEvents = set()#uniqueEvents holds all events: 13418 in total
        eventsForUser = defaultdict(set)#maps each user to the set of events they touched
        usersForEvent = defaultdict(set)#maps each event to the set of users who touched it
        for filename in ['train.csv', 'test.csv']:
            f = open(filename)
            f.readline()#skip the header line
            for line in f:
                cols = line.strip().split(',')
                uniqueUsers.add( cols[0] )
                uniqueEvents.add( cols[1] )
                eventsForUser[cols[0]].add( cols[1] )
                usersForEvent[cols[1]].add( cols[0] )
            f.close()
         
        self.userEventScores = ss.dok_matrix( ( len(uniqueUsers), len(uniqueEvents) ) )
        self.userIndex = dict()
        self.eventIndex = dict()
        for i, u in enumerate(uniqueUsers):
            self.userIndex[u] = i
        for i, e in enumerate(uniqueEvents):
            self.eventIndex[e] = i
             
        ftrain = open('train.csv')
        ftrain.readline()
        for line in ftrain:
            cols = line.strip().split(',')
            i = self.userIndex[ cols[0] ]
            j = self.eventIndex[ cols[1] ]
            self.userEventScores[i, j] = int( cols[4] ) - int( cols[5] )
        ftrain.close()
        sio.mmwrite('PE_userEventScores', self.userEventScores)
         
        #To avoid unnecessary computation, collect only the associated users and events:
        #associated users are pairs of users who acted on at least one common event,
        #associated events are pairs of events that at least one common user acted on
        self.uniqueUserPairs = set()
        self.uniqueEventPairs = set()
        for event in uniqueEvents:
            users = usersForEvent[event]
            if len(users) > 2:
                self.uniqueUserPairs.update( itertools.combinations(users, 2) )
        for user in uniqueUsers:
            events = eventsForUser[user]
            if len(events) > 2:
                self.uniqueEventPairs.update( itertools.combinations(events, 2) )
        #print(self.userIndex)
        cPickle.dump( self.userIndex, open('PE_userIndex.pkl', 'wb'))
        cPickle.dump( self.eventIndex, open('PE_eventIndex.pkl', 'wb') )
         
 
#Data cleaning class
class DataCleaner:
    def __init__(self):
        #helper methods that turn strings into numeric values
        #load the locales
        self.localeIdMap = defaultdict(int)
         
        for i, l in enumerate(locale.locale_alias.keys()):
            self.localeIdMap[l] = i + 1
             
        #load the countries
        self.countryIdMap = defaultdict(int)
        ctryIdx = defaultdict(int)
        for i, c in enumerate(pycountry.countries):
            self.countryIdMap[c.name.lower()] = i + 1
            #note: in recent pycountry versions the US entry is named 'United States',
            #so this 'usa' comparison may never match
            if c.name.lower() == 'usa':
                ctryIdx['US'] = i
            if c.name.lower() == 'canada':
                ctryIdx['CA'] = i
             
        for cc in ctryIdx.keys():
            for s in pycountry.subdivisions.get(country_code=cc):
                self.countryIdMap[s.name.lower()] = ctryIdx[cc] + 1
                 
        self.genderIdMap = defaultdict(int, {'male':1, 'female':2})
                 
    #map a locale string to its id
    def getLocaleId(self, locstr):
        #localeIdMap is a defaultdict(int), so if locstr.lower() is not a key
        #it returns the default int 0
        return self.localeIdMap[ locstr.lower() ]
         
    #handle birthyear
    def getBirthYearInt(self, birthYear):
        try:
            return 0 if birthYear == 'None' else int(birthYear)
        except:
            return 0
             
    #handle gender
    def getGenderId(self, genderStr):
        return self.genderIdMap[genderStr]
         
    #handle joinedAt
    def getJoinedYearMonth(self, dateString):
        dttm = datetime.datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%S.%fZ")
        return "".join( [str(dttm.year), str(dttm.month) ] )
         
    #handle location
    def getCountryId(self, location):
        if (isinstance( location, str)) and len(location.strip()) > 0 and location.rfind('  ') > -1:
            return self.countryIdMap[ location[location.rindex('  ') + 2: ].lower() ]
        else:
            return 0
                     
    #handle timezone
    def getTimezoneInt(self, timezone):
        try:
            return int(timezone)
        except:
            return 0
         
    def getFeatureHash(self, value):
        if len(value.strip()) == 0:
            return -1
        else:
            #return int( hashlib.sha224(value).hexdigest()[0:4], 16) raises in Python 3:
            #TypeError: Unicode-objects must be encoded before hashing
            return int( hashlib.sha224(value.encode('utf-8')).hexdigest()[0:4], 16)#in Python 3 the string must be encoded first
     
    def getFloatValue(self, value):
        if len(value.strip()) == 0:
            return 0.0
        else:
            return float(value)
             
 
#User-user similarity matrix
class Users:
    """
    Build the user/user similarity matrix
    """
    def __init__(self, programEntities, sim=ssd.correlation):#ssd.correlation(u, v) computes the correlation distance (1 - Pearson correlation) between u and v
        cleaner = DataCleaner()
        nusers = len(programEntities.userIndex.keys())#3391
        #print(nusers)
        fin = open('users.csv')
        colnames = fin.readline().strip().split(',') #7 feature columns
        self.userMatrix = ss.dok_matrix( (nusers, len(colnames)-1 ) )#build a sparse matrix
        for line in fin:
            cols = line.strip().split(',')
            #the original author's comment said "only consider users appearing in train.csv",
            #but userIndex actually covers the users of both train and test,
            #so users from either file are filled in here
            if cols[0] in programEntities.userIndex:
                i = programEntities.userIndex[ cols[0] ]#look up the user's index
                self.userMatrix[i, 0] = cleaner.getLocaleId( cols[1] )#locale
                self.userMatrix[i, 1] = cleaner.getBirthYearInt( cols[2] )#birthyear, empty values become 0
                self.userMatrix[i, 2] = cleaner.getGenderId( cols[3] )#gender
                self.userMatrix[i, 3] = cleaner.getJoinedYearMonth( cols[4] )#joinedAt column
                self.userMatrix[i, 4] = cleaner.getCountryId( cols[5] )#location
                self.userMatrix[i, 5] = cleaner.getTimezoneInt( cols[6] )#timezone
        fin.close()
         
        #normalize the matrix
        self.userMatrix = normalize(self.userMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('US_userMatrix', self.userMatrix)
         
        #compute the user similarity matrix, used later on
        self.userSimMatrix = ss.dok_matrix( (nusers, nusers) )#(3391, 3391)
        for i in range(0, nusers):
            self.userSimMatrix[i, i] = 1.0
         
        for u1, u2 in programEntities.uniqueUserPairs:
            i = programEntities.userIndex[u1]
            j = programEntities.userIndex[u2]
            if (i, j) not in self.userSimMatrix:
                #print(self.userMatrix.getrow(i).todense()) e.g. [[0.00028123,0.00029847,0.00043592,0.00035208,0,0.00032346]]
                #print(self.userMatrix.getrow(j).todense()) e.g. [[0.00028123,0.00029742,0.00043592,0.00035208,0,-0.00032346]]
                usim = sim(self.userMatrix.getrow(i).todense(),self.userMatrix.getrow(j).todense())
                self.userSimMatrix[i, j] = usim
                self.userSimMatrix[j, i] = usim
        sio.mmwrite('US_userSimMatrix', self.userSimMatrix)
 
 
#Mining user social relationships
class UserFriends:
    """
    Find a user's friends; the idea is simple:
    1) if you have more friends, you are probably more outgoing and more likely to attend events
    2) if your friends attend an event, you may follow them and attend it too
    """
    def __init__(self, programEntities):
        nusers = len(programEntities.userIndex.keys())#3391
        self.numFriends = np.zeros( (nusers) )#array([0., 0., ..., 0.]), holds each user's friend count
        self.userFriends = ss.dok_matrix( (nusers, nusers) )
        fin = gzip.open('user_friends.csv.gz')
        print( 'Header In User_friends.csv.gz:',fin.readline() )
        ln = 0
        #read user_friends.csv.gz line by line
        #only users whose id (first column) appears in userIndex are of interest
        #look up that user's index and friend count
        #for each friend that is also in userIndex, look up the friend's index,
        #then read that friend's reaction to every event from userEventScores;
        #score is the friend's average score over all events
        #the userFriends matrix records this score between the user and each friend
        #e.g. user 851286067 (index 1750) appears in test.csv and has 2151 friends in user_friends.csv.gz,
        #so that user's share of the total friend count sumNumFriends=3731377.0
        #is 2151 / 3731377 = 0.0005764627910822198
        for line in fin:
            if ln % 200 == 0:
                print( 'Loading line:', ln )
            cols = line.decode().strip().split(',')
            user = cols[0]
            if user in programEntities.userIndex:
                friends = cols[1].split(' ')#this user's friend list
                i = programEntities.userIndex[user]
                self.numFriends[i] = len(friends)
                for friend in friends:
                    if friend in programEntities.userIndex:
                        j = programEntities.userIndex[friend]
                        #the objective of this score is to infer the degree
                        #and direction in which this friend will influence the
                        #user's decision, so we sum the user/event score for
                        #this user across all training events
                        eventsForUser = programEntities.userEventScores.getrow(j).todense()#the friend's reaction to each event: 0, 1 or -1
                        #print(eventsForUser.sum(), np.shape(eventsForUser)[1] )
                        #score is the friend's average score over all 13418 events
                        score = eventsForUser.sum() / np.shape(eventsForUser)[1]
                        #print(score)
                        self.userFriends[i, j] += score
                        self.userFriends[j, i] += score
            ln += 1
        fin.close()
        #normalize the arrays
        sumNumFriends = self.numFriends.sum(axis=0)#sum of all users' friend counts
        #print(sumNumFriends)
        self.numFriends = self.numFriends / sumNumFriends#each user's share of the total friend count
        sio.mmwrite('UF_numFriends', np.matrix(self.numFriends) )
        self.userFriends = normalize(self.userFriends, norm='l1', axis=0, copy=False)
        sio.mmwrite('UF_userFriends', self.userFriends)
     
 
         
#Build the event-event similarity data
class Events:
    """
    Build event-event similarities; note there are two kinds:
    1) similarity computed from user-event behaviour, collaborative-filtering style
    2) similarity computed from the event's own content (the event metadata)
    """
    def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
        cleaner = DataCleaner()
        fin = gzip.open('events.csv.gz')
        fin.readline()#skip header
        nevents = len(programEntities.eventIndex)  #number of events
        print(nevents)#13418
        self.eventPropMatrix = ss.dok_matrix( (nevents, 7) )  #events x first-7-column property features
        self.eventContMatrix = ss.dok_matrix( (nevents, 100) ) #events x c_1..c_100 content counts
        ln = 0
        for line in fin:
            #if ln > 10:
                #break
            cols = line.decode().strip().split(',')
            eventId = cols[0]
            if eventId in programEntities.eventIndex:
                i = programEntities.eventIndex[eventId]
                self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth( cols[2] )#start_time
                self.eventPropMatrix[i, 1] = cleaner.getFeatureHash( cols[3] )#city
                self.eventPropMatrix[i, 2] = cleaner.getFeatureHash( cols[4] )#state
                self.eventPropMatrix[i, 3] = cleaner.getFeatureHash( cols[5] )#zip
                self.eventPropMatrix[i, 4] = cleaner.getFeatureHash( cols[6] )#country
                self.eventPropMatrix[i, 5] = cleaner.getFloatValue( cols[7] )#lat
                self.eventPropMatrix[i, 6] = cleaner.getFloatValue( cols[8] )#lon
                #copy columns 10-109 (c_1..c_100) into the content matrix
                for j in range(9, 109):
                    self.eventContMatrix[i, j-9] = cols[j]
                 
            ln += 1
        fin.close()
        #L1-normalize feature matrix 1
        self.eventPropMatrix = normalize(self.eventPropMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventPropMatrix', self.eventPropMatrix)
        #L1-normalize feature matrix 2
        self.eventContMatrix = normalize(self.eventContMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventContMatrix', self.eventContMatrix)
         
        #calculate similarity between event pairs based on the two matrices
        self.eventPropSim = ss.dok_matrix( (nevents, nevents) )
        self.eventContSim = ss.dok_matrix( (nevents, nevents) )
        for e1, e2 in programEntities.uniqueEventPairs:
            i = programEntities.eventIndex[e1]
            j = programEntities.eventIndex[e2]
            #similarity over the first 7 property columns
            if not ((i, j) in self.eventPropSim):
                epsim = psim( self.eventPropMatrix.getrow(i).todense(), self.eventPropMatrix.getrow(j).todense())
                self.eventPropSim[i, j] = epsim
                self.eventPropSim[j, i] = epsim
            #similarity over the content columns
            #(note: if a row is all zeros, correlation/cosine return nan;
            #the Step 5 version below guards against this)
            if not ((i, j) in self.eventContSim):
                ecsim = csim( self.eventContMatrix.getrow(i).todense(), self.eventContMatrix.getrow(j).todense())
                self.eventContSim[i, j] = ecsim
                self.eventContSim[j, i] = ecsim
                 
        sio.mmwrite('EV_eventPropSim', self.eventPropSim)
        sio.mmwrite('EV_eventContSim', self.eventContSim)
 
         
print('Step 1: collect user and event statistics...')
pe = ProgramEntities()
print('Step 1 complete...\n')
 
print('Step 2: compute user similarities and store them as a matrix...')
#Users(pe)
print('Step 2 complete...\n')
 
print('Step 3: compute and store user social relationships...')
UserFriends(pe)
print('Step 3 complete...\n')
 
print('Step 4: compute event similarities and store them as matrices...')
Events(pe)
print('Step 4 complete...\n')
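After this driver runs, every intermediate result is stored on disk as a Matrix Market (.mtx) file or a pickle. A minimal sanity check that the artifacts can be read back (file names as written by the classes above):

import scipy.io as sio
import _pickle as cPickle

userIndex = cPickle.load(open('PE_userIndex.pkl', 'rb'))
eventPropSim = sio.mmread('EV_eventPropSim')  # returns a sparse COO matrix
print(len(userIndex))       # 3391 users
print(eventPropSim.shape)   # (13418, 13418)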

Step 5: Activity / event popularity data

This step uses the event_attendees.csv.gz file, so let's take a look at it first:

import pandas as pd
df_events_attendees = pd.read_csv('event_attendees.csv.gz', compression='gzip')
df_events_attendees.head()
[df_events_attendees.head() output, flattened during extraction. Columns: event, yes, maybe, invited, no; each cell is a space-separated list of user ids (NaN when the list is empty).]
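The EventAttendees class in the full code below scores each event as (number of yes responses) - (number of no responses). A minimal sketch of that computation on a single made-up row:

# one illustrative row: event, yes, maybe, invited, no (the ids are invented)
cols = ['1159822043', '1975964455 252302513', '2733420590', '1723091036', '3575574655 1077296663']
yes_count = len(cols[1].split(' '))   # 2 users answered yes
no_count = len(cols[4].split(' '))    # 2 users answered no
print(yes_count - no_count)           # popularity contribution: 0
# caveat: ''.split(' ') returns [''], so an empty yes/no field still counts as 1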
## Step 5: complete code
from collections import defaultdict
import locale, pycountry
import scipy.sparse as ss
import scipy.io as sio
import itertools
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
 
import scipy.spatial.distance as ssd
import datetime
from sklearn.preprocessing import normalize
 
import gzip
import numpy as np
 
import hashlib
 
#ProgramEntities, DataCleaner, Users and UserFriends are identical to the
#Step 4 code above and are omitted here; only Events (which now guards
#against NaN similarities) and the new EventAttendees class change.
     
 
         
#Build the event-event similarity data
class Events:
    """
    Build event-event similarities; note there are two kinds:
    1) similarity computed from user-event behaviour, collaborative-filtering style
    2) similarity computed from the event's own content (the event metadata)
    """
    def __init__(self, programEntities, psim=ssd.correlation, csim=ssd.cosine):
        cleaner = DataCleaner()
        fin = gzip.open('events.csv.gz')
        fin.readline()#skip header
        nevents = len(programEntities.eventIndex)
        print(nevents)#13418
        self.eventPropMatrix = ss.dok_matrix( (nevents, 7) )
        self.eventContMatrix = ss.dok_matrix( (nevents, 100) )
        ln = 0
        for line in fin:
            #if ln > 10:
                #break
            cols = line.decode().strip().split(',')
            eventId = cols[0]
            if eventId in programEntities.eventIndex:
                i = programEntities.eventIndex[eventId]
                self.eventPropMatrix[i, 0] = cleaner.getJoinedYearMonth( cols[2] )#start_time
                self.eventPropMatrix[i, 1] = cleaner.getFeatureHash( cols[3] )#city
                self.eventPropMatrix[i, 2] = cleaner.getFeatureHash( cols[4] )#state
                self.eventPropMatrix[i, 3] = cleaner.getFeatureHash( cols[5] )#zip
                self.eventPropMatrix[i, 4] = cleaner.getFeatureHash( cols[6] )#country
                self.eventPropMatrix[i, 5] = cleaner.getFloatValue( cols[7] )#lat
                self.eventPropMatrix[i, 6] = cleaner.getFloatValue( cols[8] )#lon
                for j in range(9, 109):
                    self.eventContMatrix[i, j-9] = cols[j]
                 
            ln += 1
        fin.close()
         
        self.eventPropMatrix = normalize(self.eventPropMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventPropMatrix', self.eventPropMatrix)
        self.eventContMatrix = normalize(self.eventContMatrix, norm='l1', axis=0, copy=False)
        sio.mmwrite('EV_eventContMatrix', self.eventContMatrix)
         
        #calculate similarity between event pairs based on the two matrices
        self.eventPropSim = ss.dok_matrix( (nevents, nevents) )
        self.eventContSim = ss.dok_matrix( (nevents, nevents) )
        for e1, e2 in programEntities.uniqueEventPairs:
            i = programEntities.eventIndex[e1]
            j = programEntities.eventIndex[e2]
            if not ((i, j) in self.eventPropSim):
                epsim = psim( self.eventPropMatrix.getrow(i).todense(), self.eventPropMatrix.getrow(j).todense())
                if np.isnan(epsim):
                    epsim = 0
                self.eventPropSim[i, j] = epsim
                self.eventPropSim[j, i] = epsim
                 
            if not ((i, j) in self.eventContSim):
                #if either vector is all zeros, cosine returns nan:
                """
                import numpy as np
                a = np.array([0, 1, 1, 1, 0, 0, 0, 1, 0, 0])
                b = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
                 
                from scipy.spatial.distance import cosine
                temp = cosine(a, b)
                raises the following warning:
                Warning (from warnings module):
                File "D:\Python35\lib\site-packages\scipy\spatial\distance.py", line 644
                dist = 1.0 - uv / np.sqrt(uu * vv)
                RuntimeWarning: invalid value encountered in double_scalars
                 
                """
                ecsim = csim( self.eventContMatrix.getrow(i).todense(), self.eventContMatrix.getrow(j).todense())
                if np.isnan(ecsim):
                    ecsim = 0
                self.eventContSim[i, j] = ecsim
                self.eventContSim[j, i] = ecsim
                 
        sio.mmwrite('EV_eventPropSim', self.eventPropSim)
        sio.mmwrite('EV_eventContSim', self.eventContSim)
#Step 5
class EventAttendees:
    """
    For each event, count who attended and who did not,
    as the basis for the event's popularity
    """
    def __init__(self, programEntities):
        nevents = len(programEntities.eventIndex)#13418 events in total
        self.eventPopularity = ss.dok_matrix( (nevents, 1) )
        f = gzip.open('event_attendees.csv.gz')
        f.readline()#skip header
        for line in f:
            cols = line.decode().strip().split(',')
            eventId = cols[0]
            if eventId in programEntities.eventIndex:
                i = programEntities.eventIndex[eventId]
                self.eventPopularity[i, 0] = len(cols[1].split(' ')) - len(cols[4].split(' '))#yes count minus no count
        f.close()
         
        self.eventPopularity = normalize( self.eventPopularity, norm='l1', axis=0, copy=False)
        sio.mmwrite('EA_eventPopularity', self.eventPopularity)
         
    
def data_prepare():
    """
    Compute and store all the data, as matrices or in other forms,
    so that features can be extracted and models built later
    """
    print('Step 1: collect user and event statistics...')
    pe = ProgramEntities()
    print('Step 1 complete...\n')
 
    print('Step 2: compute user similarities and store them as a matrix...')
    Users(pe)
    print('Step 2 complete...\n')
 
    print('Step 3: compute and store user social relationships...')
    UserFriends(pe)
    print('Step 3 complete...\n')
 
    print('Step 4: compute event similarities and store them as matrices...')
    Events(pe)
    print('Step 4 complete...\n')
 
    print('Step 5: compute event popularity...')
    EventAttendees(pe)
    print('Step 5 complete...\n')
 
 
#run the data preparation
data_prepare()

Step 6: Feature construction

#This is the feature construction part
 
#import cPickle
#In Python 3, cPickle has been replaced by _pickle
import _pickle as cPickle
import scipy.io as sio
import numpy as np
 
class DataRewriter:
    def __init__(self):
        #load all the precomputed data
        self.userIndex = cPickle.load( open('PE_userIndex.pkl','rb') )
        self.eventIndex = cPickle.load( open('PE_eventIndex.pkl', 'rb') )
        self.userEventScores = sio.mmread('PE_userEventScores').todense()
        self.userSimMatrix = sio.mmread('US_userSimMatrix').todense()
        self.eventPropSim = sio.mmread('EV_eventPropSim').todense()
        self.eventContSim = sio.mmread('EV_eventContSim').todense()
        self.numFriends = sio.mmread('UF_numFriends')
        self.userFriends = sio.mmread('UF_userFriends').todense()
        self.eventPopularity = sio.mmread('EA_eventPopularity').todense()
         
     
    def userReco(self, userId, eventId):
        """
        User-based collaborative filtering: how strongly is this event recommended for this user?
        The basic pseudocode:
                for item i
                        for every other user v that has a preference for i
                                compute similarity s between u and v
                                incorporate v's preference for i weighted by s into a running average
                return top items ranked by weighted average
         
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]
        vs = self.userEventScores[:, j]#every user's score for event j
        sims = self.userSimMatrix[i, :]#user i's similarity to every other user
        prod = sims * vs#(1 x nusers) x (nusers x 1): similarity-weighted sum of scores
        try:
            return prod[0, 0] - self.userEventScores[i, j]#subtract the user's own score
        except IndexError:
            return 0
         
    def eventReco(self, userId, eventId):
        """
        Item-based collaborative filtering: how strongly is this event recommended for this user?
        The basic pseudocode:
        for item i:
            for every item j that u has a preference for
                compute similarity s between i and j
                add u's preference for j weighted by s to a running average
        return top items, ranked by weighted average
        """
        i = self.userIndex[userId]
        j = self.eventIndex[eventId]
        js = self.userEventScores[i, :]
        psim = self.eventPropSim[:, j]
        csim = self.eventContSim[:, j]
        pprod = js * psim
        cprod = js * csim
        pscore = 0
        cscore = 0
        try:
            pscore = pprod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            pass
         
        try:
            cscore = cprod[0, 0] - self.userEventScores[i, j]
        except IndexError:
            pass
         
        return pscore, cscore
     
    def userPop(self, userId):
        """
        Infer how social a user is from their friend count.
        The idea: a user with many friends is probably more inclined to attend events.
        """
         
        if userId in self.userIndex:
            i = self.userIndex[userId]
            try:
                return self.numFriends[0, i]
            except IndexError:
                return 0
        else:
            return 0
         
    def friendInfluence(self, userId):
        """
        Friends' influence on the user:
        how many of the user's friends are keen event-goers.
        A friend circle that actively attends events probably sways the user.
        """
        nusers = np.shape(self.userFriends)[1]
        i = self.userIndex[userId]
        #the line below looks suspect: userFriends[i, :] is a 1 x nusers row,
        #so sum(axis=0) leaves it unchanged and [0, 0] picks out only the
        #first column; summing all friends' scores would need sum(axis=1)
        return (self.userFriends[i, :].sum(axis=0) / nusers)[0, 0]
     
    def eventPop(self, eventId):
        """
        The event's own popularity,
        measured by attendance counts
        """
        i = self.eventIndex[eventId]
        return self.eventPopularity[i, 0]
     
    def rewriteData(self, start=1, train=True, header=True):
        """
        Combine the user-based CF score, the item-based CF scores and the various
        popularity/influence measures into features,
        generating a new train/test file for the classifier
        """
        fn = 'train.csv' if train else 'test.csv'
        fin = open(fn)
        fout = open('data_' + fn, 'w')
        #write output header
        if header:
            ocolnames = ['invited', 'user_reco', 'evt_p_reco', 'evt_c_reco', 'user_pop', 'frnd_infl', 'evt_pop']
            if train:
                ocolnames.append('interested')
                ocolnames.append('not_interested')
            fout.write( ','.join(ocolnames) + '\n' )
 
        ln = 0
        for line in fin:
            ln += 1
            if ln < start:
                continue
            cols = line.strip().split(',')
            #user,event,invited,timestamp,interested,not_interested
            userId = cols[0]
            eventId = cols[1]
            invited = cols[2]
            if ln % 500 == 0:
                print("%s : %d (userId, eventId) = (%s, %s)" % (fn, ln, userId, eventId))
                 
            user_reco = self.userReco( userId, eventId )
            evt_p_reco, evt_c_reco = self.eventReco( userId, eventId )
            user_pop = self.userPop( userId )
            frnd_infl = self.friendInfluence( userId )
            evt_pop = self.eventPop( eventId )
            ocols = [invited, user_reco, evt_p_reco, evt_c_reco, user_pop, frnd_infl, evt_pop]
             
            if train:
                ocols.append( cols[4] )#interested
                ocols.append( cols[5] )#not_interested
                 
            fout.write(','.join( map(lambda x: str(x), ocols)) + '\n')
             
        fin.close()
        fout.close()
         
    def rewriteTrainingSet(self):
        #use keyword arguments: rewriteData(True) would pass True as `start`
        self.rewriteData(train=True, start=2)
 
    def rewriteTestSet(self):
        self.rewriteData(train=False, start=2)
 
dr = DataRewriter()
print('Generating training data...\n')
dr.rewriteData(train=True, start=2, header=True)
 
print('Generating prediction data...\n')
dr.rewriteData(train=False, start=2, header=True)
print('done')
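data_train.csv now holds one row per (user, event) pair: the seven features plus the two labels. A quick look to verify the rewrite (assuming the step above completed):

import pandas as pd
df_feat = pd.read_csv('data_train.csv')
print(df_feat.columns.tolist())
# ['invited', 'user_reco', 'evt_p_reco', 'evt_c_reco', 'user_pop', 'frnd_infl', 'evt_pop', 'interested', 'not_interested']
print(df_feat.head())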

Step 7: Model building and prediction

import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')
 
def train():
    """
    Train a classifier on the features we built; the target is 1 (interested) or 0 (not interested)
    """
    trainDf = pd.read_csv('data_train.csv')
    X = np.matrix( pd.DataFrame(trainDf, index=None, columns=['invited', 'user_reco', 'evt_p_reco',
                    'evt_c_reco','user_pop', 'frnd_infl', 'evt_pop']) )
    y = np.array(trainDf.interested)
     
    clf = SGDClassifier(loss='log', penalty='l2')#note: scikit-learn >= 1.1 spells this loss 'log_loss'
    clf.fit(X, y)
    return clf
 
def validate():
    """
    10-fold cross-validation; prints each fold's accuracy and the overall average
    """
    trainDf = pd.read_csv('data_train.csv')
    X = np.matrix(pd.DataFrame(trainDf, index=None, columns=['invited', 'user_reco', 'evt_p_reco',
                    'evt_c_reco','user_pop', 'frnd_infl', 'evt_pop']) )
    y = np.array(trainDf.interested)
     
    nrows = len(trainDf)
    kfold = KFold(n_splits=10,shuffle=False)
    avgAccuracy = 0
    run = 0
    for train, test in kfold.split(X, y):
        Xtrain, Xtest, ytrain, ytest = X[train], X[test], y[train], y[test]
        clf = SGDClassifier(loss='log', penalty='l2')
        clf.fit(Xtrain, ytrain)
        accuracy = 0
        ntest = len(ytest)
        for i in range(0, ntest):
            yt = clf.predict(Xtest[i, :])
            if yt == ytest[i]:
                accuracy += 1
                 
        accuracy = accuracy / ntest
        print('accuracy(run %d) : %f' % (run, accuracy) )
        avgAccuracy += accuracy
        run += 1
    print('average accuracy : %f' % (avgAccuracy / run) )
         
def test(clf):
    """
    Read the test data and predict with the classifier
    """
    origTestDf = pd.read_csv("test.csv")
    users = origTestDf.user
    events = origTestDf.event
     
    testDf = pd.read_csv("data_test.csv")
    fout = open("result.csv", 'w')
    fout.write(",".join(["user", "event", "outcome", "dist"]) + "\n")
     
    nrows = len(testDf)
    Xp = np.matrix(testDf)
    yp = np.zeros((nrows, 2))
     
    for i in range(0, nrows):
        xp = Xp[i, :]
        yp[i, 0] = clf.predict(xp)
        yp[i, 1] = clf.decision_function(xp)
        fout.write(",".join( map( lambda x: str(x), [users[i], events[i], yp[i, 0], yp[i, 1]] ) ) + "\n")
    fout.close()
         
clf = train()
validate()
test(clf)
print('done')
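result.csv records, for each (user, event) pair, the predicted class and the distance to the decision boundary. Since the original challenge asks for a ranked list of events per user, a natural post-processing step is to sort each user's events by that distance; this grouping sketch is our own addition, not part of the original solution:

import pandas as pd

df = pd.read_csv('result.csv')
# rank each user's candidate events by decision-function value, highest first
ranked = (df.sort_values(['user', 'dist'], ascending=[True, False])
            .groupby('user')['event']
            .apply(lambda s: ' '.join(map(str, s))))
ranked.to_csv('ranked_events.csv', header=['events'])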