import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Load the training set from 'train.csv' (path is relative to the working directory).
train_df = pd.read_csv('train.csv')
# Preview the first rows; as the last expression in the cell this is what gets rendered.
train_df.head()
| id | target | comment_text | severe_toxicity | obscene | identity_attack | insult | threat | asian | atheist | ... | article_id | rating | funny | wow | sad | likes | disagree | sexual_explicit | identity_annotator_count | toxicity_annotator_count |
---|
0 | 59848 | 0.000000 | This is so cool. It's like, 'would you want yo... | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | ... | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 4 |
---|
1 | 59849 | 0.000000 | Thank you!! This would make my life a lot less... | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | ... | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 4 |
---|
2 | 59852 | 0.000000 | This is such an urgent design problem; kudos t... | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | ... | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 4 |
---|
3 | 59855 | 0.000000 | Is this something I'll be able to install on m... | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | ... | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 | 4 |
---|
4 | 59856 | 0.893617 | haha you guys are a bunch of losers. | 0.021277 | 0.0 | 0.021277 | 0.87234 | 0.0 | 0.0 | 0.0 | ... | 2006 | rejected | 0 | 0 | 0 | 1 | 0 | 0.0 | 4 | 47 |
---|
# Load the test set from 'test.csv' (path is relative to the working directory).
test_df = pd.read_csv('test.csv')
# Preview the first rows of the test set.
test_df.head()
id | comment_text |
---|
0 | 7000000 | Jeff Sessions is another one of Trump's Orwell... |
---|
1 | 7000001 | I actually inspected the infrastructure on Gra... |
---|
2 | 7000002 | No it won't . That's just wishful thinking on ... |
---|
3 | 7000003 | Instead of wringing our hands and nibbling the... |
---|
4 | 7000004 | how many of you commenters have garbage piled ... |
---|
对于测试集,应该做的就是提取出与训练集类似的关键字,并对每一个样本进行标注,最后进行训练。
# Collect the training-set column labels as a plain Python list.
column_name = train_df.columns.values.tolist()
['id', 'target', 'comment_text', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'asian', 'atheist', 'bisexual', 'black', 'buddhist', 'christian', 'female', 'heterosexual', 'hindu', 'homosexual_gay_or_lesbian', 'intellectual_or_learning_disability', 'jewish', 'latino', 'male', 'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity', 'other_religion', 'other_sexual_orientation', 'physical_disability', 'psychiatric_or_mental_illness', 'transgender', 'white', 'created_date', 'publication_id', 'parent_id', 'article_id', 'rating', 'funny', 'wow', 'sad', 'likes', 'disagree', 'sexual_explicit', 'identity_annotator_count', 'toxicity_annotator_count']
# Take all rows and the columns at positions 3 through 22 (the end index 23 is exclusive),
# so the wide frame can be inspected in a narrower piece.
train_df_1 = train_df.iloc[:,3:23]
train_df_1.head()
severe_toxicity | obscene | identity_attack | insult | threat | asian | atheist | bisexual | black | buddhist | christian | female | heterosexual | hindu | homosexual_gay_or_lesbian | intellectual_or_learning_disability | jewish | latino | male | muslim |
---|
0 | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
---|
1 | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
---|
2 | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
---|
3 | 0.000000 | 0.0 | 0.000000 | 0.00000 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
---|
4 | 0.021277 | 0.0 | 0.021277 | 0.87234 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.25 | 0.0 | 0.0 | 0.0 | 0.0 |
---|
# Take all rows and the columns at positions 24 through 43 (the end index 44 is exclusive).
# NOTE(review): the train_df_1 slice (iloc[:, 3:23]) stops before position 23 and this one
# starts at 24, so position 23 is in neither slice; position 44 is also excluded here.
# Per the column list printed for train_df, those are 'other_disability' and
# 'toxicity_annotator_count' -- confirm whether skipping them is intentional.
train_df_2 = train_df.iloc[:,24:44]
train_df_2.head()
| other_gender | other_race_or_ethnicity | other_religion | other_sexual_orientation | physical_disability | psychiatric_or_mental_illness | transgender | white | created_date | publication_id | parent_id | article_id | rating | funny | wow | sad | likes | disagree | sexual_explicit | identity_annotator_count |
---|
0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2015-09-29 10:50:41.987077+00 | 2 | NaN | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 |
---|
1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2015-09-29 10:50:42.870083+00 | 2 | NaN | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 |
---|
2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2015-09-29 10:50:45.222647+00 | 2 | NaN | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 |
---|
3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2015-09-29 10:50:47.601894+00 | 2 | NaN | 2006 | rejected | 0 | 0 | 0 | 0 | 0 | 0.0 | 0 |
---|
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2015-09-29 10:50:48.488476+00 | 2 | NaN | 2006 | rejected | 0 | 0 | 0 | 1 | 0 | 0.0 | 4 |
---|
def missing_data(data):
    """Summarise missing values and dtypes for every column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        A transposed summary with one column per column of ``data`` and
        three rows: ``'Total'`` (number of null entries), ``'Percent'``
        (null entries as a percentage of the row count) and ``'Types'``
        (the column dtype rendered as a string).
    """
    # Build the null mask once; it is as large as the input frame, so
    # computing it repeatedly is wasteful on big inputs.
    null_mask = data.isnull()
    total = null_mask.sum()
    # The mask itself contains no nulls, so count() yields the row count
    # for each column.
    percent = total / null_mask.count() * 100
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    # Iterate dtypes positionally rather than via data[col], which would
    # return a DataFrame (and fail) when a column label is duplicated.
    tt['Types'] = [str(dtype) for dtype in data.dtypes]
    # Transpose so the result reads as one column per input column.
    return tt.T
%%time
# Summarise null counts, null percentages and dtypes for the train_df_1 columns;
# the %%time cell magic above must remain the first line of the cell.
missing_data(train_df_1)
| severe_toxicity | obscene | identity_attack | insult | threat | asian | atheist | bisexual | black | buddhist | christian | female | heterosexual | hindu | homosexual_gay_or_lesbian | intellectual_or_learning_disability | jewish | latino | male | muslim |
---|
Total | 0 | 0 | 0 | 0 | 0 | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 |
---|
Percent | 0 | 0 | 0 | 0 | 0 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 |
---|
Types | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 |
---|
%%time
# Summarise null counts, null percentages and dtypes for the train_df_2 columns;
# the %%time cell magic above must remain the first line of the cell.
missing_data(train_df_2)
| other_gender | other_race_or_ethnicity | other_religion | other_sexual_orientation | physical_disability | psychiatric_or_mental_illness | transgender | white | created_date | publication_id | parent_id | article_id | rating | funny | wow | sad | likes | disagree | sexual_explicit | identity_annotator_count |
---|
Total | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 | 1399744 | 0 | 0 | 778646 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
---|
Percent | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 77.5536 | 0 | 0 | 43.1413 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
---|
Types | float64 | float64 | float64 | float64 | float64 | float64 | float64 | float64 | object | int64 | float64 | int64 | object | int64 | int64 | int64 | int64 | int64 | float64 | int64 |
---|