【脚本语言系列】关于文件解析(Python)

最新推荐文章于 2023-10-09 11:59:43 发布

原创最新推荐文章于 2023-10-09 11:59:43 发布 · 415 阅读

0 ·

CC 4.0 BY-SA版权

脚本语言专栏收录该内容

129 篇文章

订阅专栏

本文介绍了多种文件解析方法：csv 文件的读写（包括标准文件和其他编码文件的处理）；使用 configparser 读写配置文件；使用 robotparser 解析 robots.txt；使用 netrc 处理 netrc 文件；使用 xdrlib 进行 XDR 数据的编解码；以及使用 plistlib 解析 Mac OS X 的 .plist 文件。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

如何进行文件解析

csv

读写标准的文件

eggs.csv

Spam, Spam, Spam, Spam, Spam, Baked Beans
Spam, Lovely Spam, Wonderful Spam

读取csv文件

import csv
with open('some.csv','rb') as csvfile:
	reader = csv.reader(csvfile)
	for row in reader:
		print ','.join(row)

使用备用格式读取

import csv
with open('passwd', 'rb') as f:
    reader = csv.reader(f, delimiter=':', quoting=csv.QUOTE_NONE)
    for row in reader:
        print row

使用字典阅读器读取

import csv
# DictReader maps every record to a dict keyed by the header row.
with open('names.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Format a single string: under Python 2, print(a, b) would print
        # the repr of a tuple instead of the two names.
        print('%s %s' % (row['first_name'], row['last_name']))

推断文件格式

import csv

with open('example.csv', 'rb') as csvfile:
    # Guess the dialect from the first 1024 bytes of the file.
    dialect = csv.Sniffer().sniff(csvfile.read(1024))
    # Rewind so the reader starts from the beginning of the file again.
    csvfile.seek(0)
    reader = csv.reader(csvfile, dialect)
    # Rows must be consumed from `reader` inside this block, while the file is open.

写入csv文件

import csv
# `someiterable` is a placeholder for any iterable of rows supplied by the caller.
with open('some.csv', 'wb') as f:
    csv.writer(f).writerows(someiterable)

使用备用格式写入

import csv
# Write space-delimited records; fields that need quoting are wrapped in '|'.
with open('some.csv', 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter=' ', quotechar='|',
                        quoting=csv.QUOTE_MINIMAL)
    # writerow is a method of the writer object, not a free function.
    writer.writerow(['Spam'] * 5 + ['Baked Beans'])
    writer.writerow(['Spam', 'Lovely Spam', 'Wonderful Spam'])

注册文件格式（读取协议）

import csv
# Register the format once under a name, then refer to it by that name.
csv.register_dialect('unixpwd', delimiter=':', quoting=csv.QUOTE_NONE)
with open('passwd', 'rb') as f:
    reader = csv.reader(f, dialect='unixpwd')

使用字典写入器写入

import csv

# Write a header row followed by three records given as dicts.
with open('names.csv', 'w') as csvfile:
    fieldnames = ['first_name', 'last_name']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # writeheader() emits the field names as the first row.
    writer.writeheader()
    writer.writerow({'first_name': 'Baked', 'last_name': 'Beans'})
    writer.writerow({'first_name': 'Lovely', 'last_name': 'Spam'})
    # Each row is a single dict (not a set wrapping a dict).
    writer.writerow({'first_name': 'Wonderful', 'last_name': 'Spam'})

解析字符串

import csv
for row in csv.reader(['one,two,three']):
	print row

读写其他编码

读取文件（unicode编码）

import csv

def unicode_csv_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
    """Yield CSV rows as lists of unicode cells.

    unicode_csv_data is an iterable of unicode lines; dialect and any extra
    keyword arguments are passed through to csv.reader.
    """
    # The csv module cannot handle Unicode input here, so the lines are
    # temporarily encoded as UTF-8 before parsing.
    utf8_lines = utf_8_encoder(unicode_csv_data)
    for encoded_row in csv.reader(utf8_lines, dialect=dialect, **kwargs):
        # Decode every parsed cell from UTF-8 back to unicode.
        yield [unicode(field, 'utf-8') for field in encoded_row]

def utf_8_encoder(unicode_csv_data):
    """Lazily yield each line of unicode_csv_data encoded as UTF-8."""
    for text_line in unicode_csv_data:
        encoded_line = text_line.encode('utf-8')
        yield encoded_line

读取文件（其他编码）

import csv, codecs, cStringIO

class UTF8Recoder:
    """
    Iterator that reads an encoded stream and reencodes the input to UTF-8
    """
    def __init__(self, f, encoding):
        """Wrap stream f in a codecs reader for the given encoding."""
        # codecs.getreader() returns a reader factory for `encoding`; calling
        # it with f produces the decoding reader used by next().
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        """Return self; the object is its own iterator."""
        return self

    def next(self):
        """Return the next item from the wrapped reader, encoded as UTF-8."""
        # `next` (rather than `__next__`) is the Python 2 iterator method name.
        return self.reader.next().encode("utf-8")

class UnicodeReader:
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        """Build a csv.reader over f after recoding it to UTF-8.

        dialect and any extra keyword arguments are passed to csv.reader.
        """
        # csv.reader is fed UTF-8 encoded lines regardless of the source encoding.
        f = UTF8Recoder(f, encoding)
        self.reader = csv.reader(f, dialect=dialect, **kwds)

    def next(self):
        """Return the next row with every cell decoded from UTF-8 to unicode."""
        row = self.reader.next()
        return [unicode(s, "utf-8") for s in row]

    def __iter__(self):
        """Return self; the object is its own iterator."""
        return self

class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        """Set up an in-memory csv.writer and an encoder for the target stream f.

        dialect and any extra keyword arguments are passed to csv.writer.
        """
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        # Incremental encoder used to convert each formatted row to `encoding`.
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Format one row as CSV and write it to the stream in the target encoding.

        Each item of row is encoded to UTF-8 before it is handed to csv.writer.
        """
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row in rows by calling writerow() on each in order."""
        for row in rows:
            self.writerow(row)

configparser

ConfigParser 模块实现了一个基本的配置文件解析器，其文件结构类似于 Microsoft Windows 的 INI 文件。您可以用它编写便于最终用户定制的 Python 程序。
解析配置文件

写入配置文件

import ConfigParser

config = ConfigParser.RawConfigParser()

# The upstream note for this example advises adding sections and items in the
# reverse order of how they should appear in the file.
# It also warns that RawConfigParser (and ConfigParser in raw mode) lets you
# store non-string values internally, but that writing them to a file, or
# reading them back in non-raw mode, raises an error; SafeConfigParser
# refuses such assignments.
config.add_section('Section1')
for option, value in [
    ('an_int', '15'),
    ('a_bool', 'true'),
    ('a_float', '3.1415'),
    ('baz', 'fun'),
    ('bar', 'Python'),
    ('foo', '%(bar)s is %(baz)s!'),
]:
    config.set('Section1', option, value)

# Persist the configuration to 'example.cfg'.
with open('example.cfg', 'wb') as configfile:
    config.write(configfile)

读取配置文件

import ConfigParser

config = ConfigParser.RawConfigParser()
config.read('example.cfg')

# getfloat() raises an exception if the value is not a float
# getint() and getboolean() also do this for their respective types
a_float = config.getfloat('Section1', 'a_float')
an_int = config.getint('Section1', 'an_int')
print a_float + an_int

# Notice that the next output does not interpolate '%(bar)s' or '%(baz)s'.
# This is because we are using a RawConfigParser().
if config.getboolean('Section1', 'a_bool'):
    print config.get('Section1', 'foo')

插值

import ConfigParser

config = ConfigParser.ConfigParser()
config.read('example.cfg')

# Set the third, optional argument of get to 1 if you wish to use raw mode.
print config.get('Section1', 'foo', 0)  # -> "Python is fun!"
print config.get('Section1', 'foo', 1)  # -> "%(bar)s is %(baz)s!"

# The optional fourth argument is a dict with members that will take
# precedence in interpolation.
print config.get('Section1', 'foo', 0, {'bar': 'Documentation',
                                        'baz': 'evil'})

import ConfigParser

# New instance with 'bar' and 'baz' defaulting to 'Life' and 'hard' each
config = ConfigParser.SafeConfigParser({'bar': 'Life', 'baz': 'hard'})
config.read('example.cfg')

print config.get('Section1', 'foo')  # -> "Python is fun!"
config.remove_option('Section1', 'bar')
config.remove_option('Section1', 'baz')
print config.get('Section1', 'foo')  # -> "Life is hard!"

移动选项

def opt_move(config, section1, section2, option):
    """Move option from section1 to section2, creating section2 if needed.

    The value is read in raw mode, so it is copied without interpolation.

    Raises ConfigParser.NoSectionError if section1 does not exist and
    ConfigParser.NoOptionError if option is not present in section1; in
    both cases the configuration is left unchanged.
    """
    # Read the value before the try block, so that a missing *source* section
    # is reported as such instead of being mistaken for a missing destination.
    value = config.get(section1, option, raw=True)
    try:
        config.set(section2, option, value)
    except ConfigParser.NoSectionError:
        # Create non-existent destination section, then store the value.
        config.add_section(section2)
        config.set(section2, option, value)
    config.remove_option(section1, option)

读取无值

import ConfigParser
import io

sample_config = """
[mysqld]
user = mysql
pid-file = /var/run/mysqld/mysqld.pid
skip-external-locking
old_passwords = 1
skip-bdb
skip-innodb
"""
# allow_no_value=True lets the parser accept keys that have no '= value' part.
config = ConfigParser.RawConfigParser(allow_no_value=True)
config_stream = io.BytesIO(sample_config)
config.readfp(config_stream)

# A setting that has a value is returned as usual:
config.get("mysqld", "user")

# A setting without a value yields None:
config.get("mysqld", "skip-bdb")

# A setting that was never specified still raises an error:
config.get("mysqld", "does-not-exist")

robotparser

robotparser 是 robots.txt 的解析器：用于判断某个用户代理（user agent）是否可以抓取发布了该 robots.txt 文件的网站上的指定 URL。

import robotparser
# Passing the URL to the constructor sets the robots.txt location directly.
rp = robotparser.RobotFileParser("http://www.musi-cal.com/robots.txt")
# Fetch and parse the robots.txt file (network access).
rp.read()
# Ask whether any user agent ("*") may fetch each URL.
for url in ("http://www.musi-cal.com/cgi-bin/search?city=San+Francisco",
            "http://www.musi-cal.com/"):
    rp.can_fetch("*", url)