第八章 文本数据
1. str对象
import numpy as np
import pandas as pd
var = 'abcd'
str.upper(var)
'ABCD'
s = pd.Series(['0abcd', 'efg', 'hi'])
s.str.upper()
0 0ABCD
1 EFG
2 HI
dtype: object
var[-1:0:-2]
'db'
s.str[-1:0:-2]
0 db
1 g
2 i
dtype: object
s.str[2]
0 b
1 g
2 NaN
dtype: object
s = pd.Series([{1:'temp_1',2:'temp_2'}, ['a','b'],0.5,'my_string'])
s
0 {1: 'temp_1', 2: 'temp_2'}
1 [a, b]
2 0.5
3 my_string
dtype: object
s.str[1]
0 temp_1
1 b
2 NaN
3 y
dtype: object
s.astype('string').str[1]
0 1
1 '
2 .
3 y
dtype: string
s.astype('string')
0 {1: 'temp_1', 2: 'temp_2'}
1 ['a', 'b']
2 0.5
3 my_string
dtype: string
2. 正则表达式基础
import re
re.findall('Apple','Apple! This Is an Apple!')
['Apple', 'Apple']
re.findall(r'.', 'abc')
re.findall(r'[ac]', 'abc')
re.findall(r'[^ac]', 'abc')
re.findall(r'[ab]{2}', 'aaaabbbb')
re.findall(r'aaa|bbb', 'aaaabbbb')
re.findall(r'a\?|a\*', 'aa?a*a')
re.findall(r'a?.', 'abaacadaae')
['ab', 'aa', 'c', 'ad', 'aa', 'e']
re.findall(r'.s','Apple! This Is an Apple!')
re.findall(r'\w{2}','09 8? 7w c_ 9q p@')
re.findall(r'\w\W\B','09 8? 7w c_ 9q p@')
re.findall(r'\w\s\w','Constant dropping wears the stone.')
re.findall(r'.\s.', 'Constant dropping wears the stone.')
re.findall(r'上海市(.{2,3}区)(.{2,3}路)(\d+号)','上海市黄浦区方浜中路249号 上海市宝山区密山路5号')
[('黄浦区', '方浜中路', '249号'), ('宝山区', '密山路', '5号')]
3. 文本处理的五类操作
3.1 拆分
s = pd.Series(['上海市黄浦区方浜中路249号', '上海市宝山区密山路5号'])
s.str.split('[市区路]')
0 [上海, 黄浦, 方浜中, 249号]
1 [上海, 宝山, 密山, 5号]
dtype: object
s.str.split('[市区路]', n=2, expand=True)
| 0 | 1 | 2 |
|---|
| 0 | 上海 | 黄浦 | 方浜中路249号 |
|---|
| 1 | 上海 | 宝山 | 密山路5号 |
|---|
3.2 合并
s = pd.Series([['a','b'], [1, 'a'], [['a', 'b'], 'c']])
s
0 [a, b]
1 [1, a]
2 [[a, b], c]
dtype: object
s.str.join('-')
0 a-b
1 NaN
2 NaN
dtype: object
s1 = pd.Series(['a','b'])
s2 = pd.Series(['cat','dog'])
s1.str.cat(s2, sep='-')
0 a-cat
1 b-dog
dtype: object
s2.index = [1,2]
s1.str.cat(s2, sep='-', na_rep='?', join='outer')
0 a-?
1 b-cat
2 ?-dog
dtype: object
3.3 匹配
s = pd.Series(['my cat', 'he is fat', 'railway station'])
s.str.contains('\s\wat')
0 True
1 True
2 False
dtype: bool
s.str.startswith('my')
0 True
1 False
2 False
dtype: bool
s.str.match('m|h')
0 True
1 True
2 False
dtype: bool
s.str[::-1]
s.str[::-1].str.match('ta[f|g]|n')
0 False
1 True
2 True
dtype: bool
s.str.contains('^[m|h]')
0 True
1 True
2 False
dtype: bool
s.str.contains('[f|g]at|n$')
0 False
1 True
2 True
dtype: bool
s = pd.Series(['This is an apple. That is not an apple.'])
s.str.find('apple')
0 11
dtype: int64
s.str.rfind('apple')
0 33
dtype: int64
3.4 替换
str.replace 和 replace 并不是一个函数,在使用字符串替换时应当使用前者
s = pd.Series(['a_1_b','c_?'])
s.str.replace('\d|\?','new',regex=True)
0 a_new_b
1 c_new
dtype: object
s = pd.Series(['上海市黄浦区方浜中路249号', '上海市宝山区密山路5号', '北京市昌平区北农路2号'])
pat = '(\w+市)(\w+区)(\w+路)(\d+号)'
city = {'上海市':'Shanghai', '北京市':'Beijing'}
district = {'昌平区': 'CP District', '黄浦区': 'HP District', '宝山区': 'BS District'}
road = {'方浜中路': 'Mid Fangbin Road', '密山路': 'Mishan Road', '北农路': 'Beinong Road'}
def my_func(m):
str_city = city[m.group(1)]
str_district = district[m.group(2)]
str_road = road[m.group(3)]
str_no = 'No. ' + m.group(4)[:-1]
return ' '.join([str_city,
str_district,
str_road,
str_no])
s.str.replace(pat, my_func, regex=True)
0 Shanghai HP District Mid Fangbin Road No. 249
1 Shanghai BS District Mishan Road No. 5
2 Beijing CP District Beinong Road No. 2
dtype: object
pat = '(?P<市名>\w+市)(?P<区名>\w+区)(?P<路名>\w+路)(?P<编号>\d+号)'
def my_func(m):
str_city = city[m.group('市名')]
str_district = district[m.group('区名')]
str_road = road[m.group('路名')]
str_no = 'No. ' + m.group('编号')[:-1]
return ' '.join([str_city,
str_district,
str_road,
str_no])
s.str.replace(pat, my_func, regex=True)
0 Shanghai HP District Mid Fangbin Road No. 249
1 Shanghai BS District Mishan Road No. 5
2 Beijing CP District Beinong Road No. 2
dtype: object
3.4 提取
pat = '(\w+市)(\w+区)(\w+路)(\d+号)'
s.str.extract(pat)
| 0 | 1 | 2 | 3 |
|---|
| 0 | 上海市 | 黄浦区 | 方浜中路 | 249号 |
|---|
| 1 | 上海市 | 宝山区 | 密山路 | 5号 |
|---|
| 2 | 北京市 | 昌平区 | 北农路 | 2号 |
|---|
pat = '(?P<市名>\w+市)(?P<区名>\w+区)(?P<路名>\w+路)(?P<编号>\d+号)'
s.str.extract(pat)
| 市名 | 区名 | 路名 | 编号 |
|---|
| 0 | 上海市 | 黄浦区 | 方浜中路 | 249号 |
|---|
| 1 | 上海市 | 宝山区 | 密山路 | 5号 |
|---|
| 2 | 北京市 | 昌平区 | 北农路 | 2号 |
|---|
s = pd.Series(['A135T15,A26S5','B674S2,B25T6'], index = ['my_A','my_B'])
s
my_A A135T15,A26S5
my_B B674S2,B25T6
dtype: object
pat = '[A|B](\d+)[T|S](\d+)'
s.str.extractall(pat)
| | 0 | 1 |
|---|
| match | | |
|---|
| my_A | 0 | 135 | 15 |
|---|
| 1 | 26 | 5 |
|---|
| my_B | 0 | 674 | 2 |
|---|
| 1 | 25 | 6 |
|---|
pat_with_name = '[A|B](?P<name1>\d+)[T|S](?P<name2>\d+)'
s.str.extractall(pat_with_name)
| | name1 | name2 |
|---|
| match | | |
|---|
| my_A | 0 | 135 | 15 |
|---|
| 1 | 26 | 5 |
|---|
| my_B | 0 | 674 | 2 |
|---|
| 1 | 25 | 6 |
|---|
s.str.findall(pat)
my_A [(135, 15), (26, 5)]
my_B [(674, 2), (25, 6)]
dtype: object
4. 常用字符串函数
4.1 字母型函数
s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
s.str.upper()
s.str.lower()
s.str.title()
s.str.capitalize()
s.str.swapcase()
0 LOWER
1 capitals
2 THIS IS A SENTENCE
3 sWaPcAsE
dtype: object
4.2 数值型函数
pd.to_numeric对字符格式的数值进行快速转换和筛选
errors 非数值的处理模式。raise-直接报错,coerce-设为缺失,ignore-保持原来的字符串downcast 非数值的转换类型
s = pd.Series(['1', '2.2', '2e', '??', '-2.1', '0'])
pd.to_numeric(s, errors='ignore')
0 1
1 2.2
2 2e
3 ??
4 -2.1
5 0
dtype: object
pd.to_numeric(s, errors='coerce')
0 1.0
1 2.2
2 NaN
3 NaN
4 -2.1
5 0.0
dtype: float64
s[pd.to_numeric(s, errors='coerce').isna()]
2 2e
3 ??
dtype: object
4.3 统计型函数
- count 返回出现正则模式的次数
- len 返回出现正则模式的字符串长度
s = pd.Series(['cat rat fat at', 'get feed sheet heat'])
s.str.count('[r|f]at|ee')
0 2
1 2
dtype: int64
s.str.len()
0 14
1 19
dtype: int64
4.4 格式型函数
- 除空型
strip-去除两侧空格 rstrip-去除右侧空格 lstrip-去除左侧空格 - 填充型
pad
my_index = pd.Index([' col1', 'col2 ', ' col3 '])
my_index.str.strip().str.len()
my_index.str.rstrip().str.len()
my_index.str.lstrip().str.len()
Int64Index([4, 5, 5], dtype='int64')
s = pd.Series(['a','b','c'])
s.str.pad(5, 'left', '*')
s.str.pad(5, 'right', '*')
s.str.pad(5, 'both', '*')
0 **a**
1 **b**
2 **c**
dtype: object
s = pd.Series([7, 155, 303000]).astype('string')
s.str.pad(5, 'left', '0')
s.str.pad(6, 'left', '0')
s.str.rjust(6,'0')
s.str.zfill(6)
0 000007
1 000155
2 303000
dtype: string
5. 练一练
Ex1:房屋信息数据集
我的答案
df = pd.read_excel('data/house_info.xls', usecols=['floor','year','area','price'])
df.head()
| floor | year | area | price |
|---|
| 0 | 高层(共6层) | 1986年建 | 58.23㎡ | 155万 |
|---|
| 1 | 中层(共20层) | 2020年建 | 88㎡ | 155万 |
|---|
| 2 | 低层(共28层) | 2010年建 | 89.33㎡ | 365万 |
|---|
| 3 | 低层(共20层) | 2014年建 | 82㎡ | 308万 |
|---|
| 4 | 高层(共1层) | 2015年建 | 98㎡ | 117万 |
|---|
df['year'] = df['year'].str.replace('年建','')
df[['Level','Highest']]=df['floor'].str.split('(共',expand=True)
df['Highest'] = df['Highest'].str.replace('层)','')
df = df.drop('floor',1)
df.head()
| year | area | price | Level | Highest |
|---|
| 0 | 1986 | 58.23㎡ | 155万 | 高层 | 6 |
|---|
| 1 | 2020 | 88㎡ | 155万 | 中层 | 20 |
|---|
| 2 | 2010 | 89.33㎡ | 365万 | 低层 | 28 |
|---|
| 3 | 2014 | 82㎡ | 308万 | 低层 | 20 |
|---|
| 4 | 2015 | 98㎡ | 117万 | 高层 | 1 |
|---|
df['price'] = pd.to_numeric(df['price'].str.replace('万',''),errors='coerce')
df['price'] = df['price']*10000
area_num = pd.to_numeric(df['area'].str.replace('㎡',''),errors='coerce')
df['avg_price'] = df['price']/area_num
df['avg_price'] = df['avg_price'].apply(lambda x:str(int(x))+'元/平米')
df
| year | area | price | Level | Highest | avg_price |
|---|
| 0 | 1986 | 58.23㎡ | 1550000.0 | 高层 | 6 | 26618元/平米 |
|---|
| 1 | 2020 | 88㎡ | 1550000.0 | 中层 | 20 | 17613元/平米 |
|---|
| 2 | 2010 | 89.33㎡ | 3650000.0 | 低层 | 28 | 40859元/平米 |
|---|
| 3 | 2014 | 82㎡ | 3080000.0 | 低层 | 20 | 37560元/平米 |
|---|
| 4 | 2015 | 98㎡ | 1170000.0 | 高层 | 1 | 11938元/平米 |
|---|
| ... | ... | ... | ... | ... | ... | ... |
|---|
| 31563 | 2010 | 391.13㎡ | 100000000.0 | 中层 | 39 | 255669元/平米 |
|---|
| 31564 | 2006 | 283㎡ | 26000000.0 | 高层 | 54 | 91872元/平米 |
|---|
| 31565 | 2011 | 245㎡ | 25000000.0 | 高层 | 16 | 102040元/平米 |
|---|
| 31566 | 2006 | 284㎡ | 35000000.0 | 高层 | 62 | 123239元/平米 |
|---|
| 31567 | 2008 | 224㎡ | 23000000.0 | 低层 | 22 | 102678元/平米 |
|---|
31568 rows × 6 columns
参考答案
df.year = pd.to_numeric(df.year.str[:-2]).astype('Int64')
pat = '(\w层)(共(\d+)层)'
new_cols = df.floor.str.extract(pat).rename(columns={0:'Level', 1:'Highest'})
df = pd.concat([df.drop(columns=['floor']), new_cols], 1)
s_area = pd.to_numeric(df.area.str[:-1])
s_price = pd.to_numeric(df.price.str[:-1])
df['avg_price'] = ((s_price/s_area)*10000).astype('int').astype('string') + '元/平米'
Ex2:《权力的游戏》剧本数据集
参考答案
这道题我智障了…
- 我以为1-2问中的台词数是要统计
Sentence每行的句子数,然后再求和 得到一共多少句台词。想多了。 - 第3问没有受到影响
df = pd.read_csv('data/script.csv')
df.columns = df.columns.str.strip()
df.groupby(['Season', 'Episode'])['Sentence'].count().head()
df.set_index('Name').Sentence.str.split().str.len().groupby('Name').mean().sort_values(ascending=False).head()
s = pd.Series(df.Sentence.values, index=df.Name.shift(-1))
s.str.count('\?').groupby('Name').sum().sort_values(ascending=False).head()
我的答案
df = pd.read_csv('data/script.csv')
df.head(3)
| Release Date | Season | Episode | Episode Title | Name | Sentence |
|---|
| 0 | 2011-04-17 | Season 1 | Episode 1 | Winter is Coming | waymar royce | What do you expect? They're savages. One lot s... |
|---|
| 1 | 2011-04-17 | Season 1 | Episode 1 | Winter is Coming | will | I've never seen wildlings do a thing like this... |
|---|
| 2 | 2011-04-17 | Season 1 | Episode 1 | Winter is Coming | waymar royce | How close did you get? |
|---|
df['Sentence'] = df['Sentence'].str.capitalize()
df[df['Name']=='slave buyer']
| Release Date | Season | Episode | Episode Title | Name | Sentence | count | word_count |
|---|
| 16187 | 2015-05-24 | Season 5 | Episode 7 | The Gift | slave buyer | 20. | 0 | 1 |
|---|
df['count'] = df['Sentence'].str.count('[A-Z]')
df.columns
res = df.groupby('Episode ')['count'].count().to_frame()
res.head()
| count |
|---|
| Episode | |
|---|
| Episode 1 | 2637 |
|---|
| Episode 10 | 1846 |
|---|
| Episode 2 | 2957 |
|---|
| Episode 3 | 2648 |
|---|
| Episode 4 | 2356 |
|---|
df.head()
| Release Date | Season | Episode | Episode Title | Name | Sentence | count | word_count |
|---|
| 0 | 2011-04-17 | Season 1 | Episode 1 | Winter is Coming | waymar royce | What do you expect? they're savages. one lot s... | 1 | 25 |
|---|
| 1 | 2011-04-17 | Season 1 | Episode 1 | Winter is Coming | will | I've never seen wildlings do a thing like this... | 1 | 21 |
|---|
| 2 | 2011-04-17 | Season 1 | Episode 1 | Winter is Coming | waymar royce | How close did you get? | 1 | 5 |
|---|
| 3 | 2011-04-17 | Season 1 | Episode 1 | Winter is Coming | will | Close as any man would. | 1 | 5 |
|---|
| 4 | 2011-04-17 | Season 1 | Episode 1 | Winter is Coming | gared | We should head back to the wall. | 1 | 7 |
|---|
df['word_count'] = df['Sentence'].str.count('\s') + 1
df.head()
| Release Date | Season | Episode | Episode Title | Name | Sentence | count | word_count |
|---|
| 0 | 2011-04-17 | Season 1 | Episode 1 | Winter is Coming | waymar royce | What do you expect? They're savages. One lot s... | 3 | 25 |
|---|
| 1 | 2011-04-17 | Season 1 | Episode 1 | Winter is Coming | will | I've never seen wildlings do a thing like this... | 2 | 21 |
|---|
| 2 | 2011-04-17 | Season 1 | Episode 1 | Winter is Coming | waymar royce | How close did you get? | 1 | 5 |
|---|
| 3 | 2011-04-17 | Season 1 | Episode 1 | Winter is Coming | will | Close as any man would. | 1 | 5 |
|---|
| 4 | 2011-04-17 | Season 1 | Episode 1 | Winter is Coming | gared | We should head back to the wall. | 1 | 7 |
|---|
res = df.groupby('Name')[['count','word_count']].sum()
res['word_average'] = res['word_count']/res['count']
res.sort_values('word_average',ascending=False)
| count | word_count | word_average |
|---|
| Name | | | |
|---|
| slave buyer | 0 | 1 | inf |
|---|
| male singer | 1 | 109 | 109.0 |
|---|
| slave owner | 1 | 77 | 77.0 |
|---|
| lollys stokeworth | 1 | 62 | 62.0 |
|---|
| manderly | 1 | 62 | 62.0 |
|---|
| ... | ... | ... | ... |
|---|
| dothraki woman | 1 | 1 | 1.0 |
|---|
| dornish prince | 1 | 1 | 1.0 |
|---|
| cold | 1 | 1 | 1.0 |
|---|
| little sam | 1 | 1 | 1.0 |
|---|
| doloroud edd | 1 | 1 | 1.0 |
|---|
564 rows × 3 columns
answer_count = df['Sentence'].str.count('\?').tolist()
answer_count = [0] + answer_count
df['answer_count']=answer_count[:-1]
res = df.groupby('Name')['answer_count'].sum().to_frame()
res.sort_values('answer_count',ascending=False).head()
| answer_count |
|---|
| Name | |
|---|
| tyrion lannister | 527 |
|---|
| jon snow | 374 |
|---|
| jaime lannister | 283 |
|---|
| arya stark | 265 |
|---|
| cersei lannister | 246 |
|---|