如何进行文件解析
csv
读写标准的文件
eggs.csv
Spam, Spam, Spam, Spam, Spam, Baked Beans
Spam, Lovely Spam, Wonderful Spam
- 读取csv文件
import csv
with open('some.csv','rb') as csvfile:
reader = csv.reader(csvfile)
for row in reader:
print ','.join(row)
- 使用备用格式读取
import csv
with open('passwd', 'rb') as f:
reader = csv.reader(f, delimiter=':', quoting=csv.QUOTE_NONE)
for row in reader:
print row
- 使用字典阅读器读取
import csv
with open('names.csv') as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
print(row['first_name'],row['last_name'])
- 推断文件格式
with open('example.csv','rb') as csvfile:
dialect = csv.Sniffer().sniff(csvfile.read(1024))
csvfile.seek(0)
reader = csv.reader(csvfile, dialet)
- 写入csv文件
import csv
with open('some.csv', 'wb') as f:
writer = csv.writer(f)
writer.writerows(someiterable)
- 使用备用格式写入
import csv
with open('some.csv','wb') as csvfile:
writer = csv.writer(csvfile, delimiter=' ',quotechar='|',quoting=csv.QUOTE_MINIMAL)
writerow(['Spam'] * 5 + [ 'Baked Beans'])
writerow(['Spam','Lovely Spam','Wonderful Spam'])
- 注册文件格式(读取协议)
import csv
csv.register_dialect('unixpwd', delimiter=':', quoting=csv.QUOTE_NONE)
with open('passwd', 'rb') as f:
reader = csv.reader(f, 'unixpwd')
- 使用字典写入器写入
import csv
with open('names.csv', 'w') as csvfile
fieldnames = ['first_name','last_name']
writer = csv.DictWriter(csvfile,fieldnames=fieldnames)
writer.writerheader()
writer.writerow({'first_name':'Baked','last_name':'Beans'})
writer.writerow({'first_name':'Lovely','last_name':'Spam'})
writer.writerow({{'first_name':'Wonderful','last_name':'Spam'}})
- 解析字符串
import csv
for row in csv.reader(['one,two,three']):
print row
读写其他编码
- 读取文件(unicode编码)
import csv
def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
# csv.py 不能处理 Unicode; 临时转码为 UTF-8:
csv_reader = csv.reader(utf_8_encoder(unicode_csv_data),
dialect=dialect, **kwargs)
for row in csv_reader:
# 转换 UTF-8 为 Unicode:
yield [unicode(cell, 'utf-8') for cell in row]
def utf_8_encoder(unicode_csv_data):
for line in unicode_csv_data:
yield line.encode('utf-8')
- 读取文件(其他编码)
import csv, codecs, cStringIO
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
configparser
ConfigParser实现一个基本配置文件解析器语言,它提供了一个类似于你会发现在Microsoft Windows INI文件的结构。您可以使用它来编写可由最终用户轻松定制的Python程序。
解析配置文件
写入配置文件
import ConfigParser
config = ConfigParser.RawConfigParser()
# When adding sections or items, add them in the reverse order of
# how you want them to be displayed in the actual file.
# In addition, please note that using RawConfigParser's and the raw
# mode of ConfigParser's respective set functions, you can assign
# non-string values to keys internally, but will receive an error
# when attempting to write to a file or when you get it in non-raw
# mode. SafeConfigParser does not allow such assignments to take place.
config.add_section('Section1')
config.set('Section1', 'an_int', '15')
config.set('Section1', 'a_bool', 'true')
config.set('Section1', 'a_float', '3.1415')
config.set('Section1', 'baz', 'fun')
config.set('Section1', 'bar', 'Python')
config.set('Section1', 'foo', '%(bar)s is %(baz)s!')
# Writing our configuration file to 'example.cfg'
with open('example.cfg', 'wb') as configfile:
config.write(configfile)
读取配置文件
import ConfigParser
config = ConfigParser.RawConfigParser()
config.read('example.cfg')
# getfloat() raises an exception if the value is not a float
# getint() and getboolean() also do this for their respective types
a_float = config.getfloat('Section1', 'a_float')
an_int = config.getint('Section1', 'an_int')
print a_float + an_int
# Notice that the next output does not interpolate '%(bar)s' or '%(baz)s'.
# This is because we are using a RawConfigParser().
if config.getboolean('Section1', 'a_bool'):
print config.get('Section1', 'foo')
- 插值
import ConfigParser
config = ConfigParser.ConfigParser()
config.read('example.cfg')
# Set the third, optional argument of get to 1 if you wish to use raw mode.
print config.get('Section1', 'foo', 0) # -> "Python is fun!"
print config.get('Section1', 'foo', 1) # -> "%(bar)s is %(baz)s!"
# The optional fourth argument is a dict with members that will take
# precedence in interpolation.
print config.get('Section1', 'foo', 0, {'bar': 'Documentation',
'baz': 'evil'})
import ConfigParser
# New instance with 'bar' and 'baz' defaulting to 'Life' and 'hard' each
config = ConfigParser.SafeConfigParser({'bar': 'Life', 'baz': 'hard'})
config.read('example.cfg')
print config.get('Section1', 'foo') # -> "Python is fun!"
config.remove_option('Section1', 'bar')
config.remove_option('Section1', 'baz')
print config.get('Section1', 'foo') # -> "Life is hard!"
- 移动选项
def opt_move(config, section1, section2, option):
try:
config.set(section2, option, config.get(section1, option, 1))
except ConfigParser.NoSectionError:
# Create non-existent section
config.add_section(section2)
opt_move(config, section1, section2, option)
else:
config.remove_option(section1, option)
- 读取无值
import ConfigParser
import io
sample_config = """
[mysqld]
user = mysql
pid-file = /var/run/mysqld/mysqld.pid
skip-external-locking
old_passwords = 1
skip-bdb
skip-innodb
"""
config = ConfigParser.RawConfigParser(allow_no_value=True)
config.readfp(io.BytesIO(sample_config))
# Settings with values are treated as before:
config.get("mysqld", "user")
# Settings without values provide None:
config.get("mysqld", "skip-bdb")
# Settings which aren't specified still raise an error:
config.get("mysqld", "does-not-exist")
robotparser
robots.txt的解析器:回答有关特定用户代理是否可以在发布该robots.txt文件的网站上获取URL的问题
import robotparser
rp = robotparser.RobotFileParser()
rp.set_url("http://www.musi-cal.com/robots.txt")
rp.read()
rp.can_fetch("*", "http://www.musi-cal.com/cgi-bin/search?city=San+Francisco")
rp.can_fetch("*", "http://www.musi-cal.com/")
netrc
处理netrc文件
xdrlib
编码和解码xdr数据
plistlib
生成和解析Mac OS X .plist文件