第二十三章:文本处理 习题答案
本章习题所用到的头文件和实现
Text.h
#include <string>
#include <vector>
#include <map>
#include <fstream>
#include <sstream>
#include <iostream>
#include <regex>
#include <exception>
using namespace std;
// Line that separates one message from the next in a mail file.
const string Mail_delimeter{ "----" };

// Read-only iterator over the stored text lines.
using Line_iter = vector<string>::const_iterator;
class Message { //一个 Message 指向一封邮件的首行和末行
Line_iter first;
Line_iter last;
public:
Message(Line_iter p1, Line_iter p2)
:first(p1), last(p2) {}
Line_iter begin() const { return first; }
Line_iter end() const { return last; }
//...
};
// Read-only iterator over the messages of a Mail_file.
using Mess_iter = vector<Message>::const_iterator;

// A Mail_file holds all the lines read from a file together with the
// messages found in them, which simplifies access to individual messages.
struct Mail_file
{
    string name;          // file name
    vector<string> lines; // the lines, in order
    vector<Message> m;    // the messages, in order

    Mail_file(const string& fname); // reads file fname into lines

    Mess_iter begin() const { return m.cbegin(); }
    Mess_iter end() const { return m.cend(); }
};
// Looks for the sender's name in a Message.
// Returns true if it is found and places the sender's name in s.
bool find_from_addr(const Message* pm, string& s);
// Checks whether a Message has a subject field.
// Returns the subject of the Message if there is one, otherwise the empty string.
string find_subject(const Message* pm);
// Writes the complete contents of the mail to os.
void print_mail(ostream& os, const Message* pm);
// std_lib_facilities.h 中有这个实现
//template<typename T>
//string to_string(const T& t)
//{
// ostringstream os;
// os << t;
// return os.str();
//}
// Exception type reporting that a string could not be converted to the
// requested type. It derives from std::bad_cast, so handlers for
// std::bad_cast or std::exception also catch it.
struct bad_from_string : std::bad_cast
{
    // noexcept is required here: std::exception::what() is declared noexcept
    // and an overrider must not have a looser exception specification.
    const char* what() const noexcept override
    {
        return "bad cast from string";
    }
};
// Converts the text in s to a T by reading it with T's stream extraction
// operator (>>). Throws bad_from_string when the extraction fails.
template<typename T>
T from_string(const string& s)
{
    istringstream source{ s };
    T value;
    if (source >> value)
        return value;
    throw bad_from_string{};
}
// Converts arg to a Target by writing it to a string stream and reading it
// back. Throws runtime_error if writing fails, if reading fails, or if
// anything other than whitespace is left in the stream after reading.
template<typename Target, typename Source>
Target to(Source arg)
{
    std::stringstream interpreter;
    Target result;

    // The steps short-circuit in order: write arg, read result, then require
    // that skipping trailing whitespace reaches the end of the stream.
    const bool converted = (interpreter << arg)
        && (interpreter >> result)
        && (interpreter >> std::ws).eof();

    if (!converted)
        throw std::runtime_error("to<>() failed");
    return result;
}
Text.cpp
#include "23_Text.h"
// Reads every line of file fname into lines, then records in m one Message
// for each non-empty run of lines that is terminated by a Mail_delimeter line.
// If the file cannot be opened, reports that on cerr and terminates the program.
Mail_file::Mail_file(const string& fname)
    : name{ fname }
{
    ifstream input{ fname };
    if (!input)
    {
        cerr << "no " << fname << '\n';
        exit(1); // terminate the program
    }

    // Build the vector of text lines.
    string text;
    while (getline(input, text))
        lines.push_back(text);

    // Build the vector of Messages.
    auto msg_start = lines.begin();
    for (auto cur = lines.begin(); cur != lines.end(); ++cur)
    {
        if (*cur != Mail_delimeter)
            continue;

        // cur is a delimiter line, which marks the end of a message.
        if (msg_start != cur) // nothing to store when several ---- lines follow each other
            m.push_back(Message{ msg_start, cur });
        msg_start = cur + 1; // the ---- line is not part of a message
    }
}
//-------------------------------------------------------------------------------
// Is p the first part of s?
// Returns the length of p when s starts with p, otherwise 0.
static int is_prefix(const std::string& s, const std::string& p)
{
    const int n = static_cast<int>(p.size());
    return s.compare(0, n, p) == 0 ? n : 0;
}
//bool find_from_addr(const Message* pm, string& s)
//{
// for (const auto& x : *pm)
// if (int n = is_prefix(x, "From: "))
// {
// s = string(x, n);
// return true;
// }
// return false;
//}
//
//string find_subject(const Message* pm)
//{
// for (const auto& x : *pm)
// if (int n = is_prefix(x, "Subject: "))
// return string(x, n);
// return "";
//}
//Exercise 23-3------------------------------------------------------------------
// Searches the lines of *pm for one of the form "From:<text>", where a single
// optional whitespace character may follow the colon.
// On the first such line, stores <text> in s and returns true.
// Returns false, leaving s untouched, when no line matches.
bool find_from_addr(const Message* pm, string& s)
{
    // Compiled once and reused by every call: building a std::regex is
    // expensive and this pattern never changes.
    static const regex pat{ R"(^From:\s?(.+)$)" };

    smatch matches;
    for (const auto& x : *pm)
    {
        if (regex_match(x, matches, pat))
        {
            s = matches[1];
            return true;
        }
    }
    return false;
}
// Searches the lines of *pm for one of the form "Subject:<text>", where a
// single optional whitespace character may follow the colon.
// Returns <text> of the first such line, or the empty string when no line matches.
string find_subject(const Message* pm)
{
    // Compiled once and reused by every call: building a std::regex is
    // expensive and this pattern never changes.
    static const regex pat{ R"(^Subject:\s?(.+)$)" };

    smatch matches;
    for (const auto& x : *pm)
    {
        if (regex_match(x, matches, pat))
            return matches[1];
    }
    return "";
}
//Exercise 23-3 End------------------------------------------------------------------
void print_mail(ostream& os, const Message* pm)
{
for (const auto& x : *pm)
os << x << '\n';
}
23.1 and 23.2
#include "23_Text.h"
int main()
{
cout << "Enter an email file to process:\n";
string fname;
cin >> fname;
Mail_file mfile{ fname }; //从一个文件读取数据初始化 mfile
//将来自每个发件人的邮件收集在一起,保存在一个 multimap 中
multimap<string, const Message*> sender;
//用于保存主题的 multimap
multimap<string, const Message*> subject;
for(const auto& m : mfile)
{
string s;
if (find_from_addr(&m, s))
sender.insert(make_pair(s, &m));
//Exercise 23_2
if ((s = find_subject(&m)).size() != 0) //如果有找到主题
subject.insert(make_pair(s, &m));
}
//现在遍历 multimap,提取 John Doe 的邮件的主题
auto pp = sender.equal_range("John Doe <jdoe@machine.example>");
for (auto p = pp.first; p != pp.second; ++p)
cout << find_subject(p->second) << '\n';
cin.ignore(); //先把之前输入的最后换行符给去掉
string subj; //用于接收主题
getline(cin, subj);
auto pp2 = subject.equal_range(subj);
for(auto p = pp2.first; p != pp2.second; ++p)
{
print_mail(cout, p->second);
cout << Mail_delimeter << '\n';
}
return 0;
}
23.3 and 23.4
#include "23_Text.h"
int main()
{
cout << "Enter an email file to process:\n";
string fname;
cin >> fname;
Mail_file mfile{ fname }; //从一个文件读取数据初始化 mfile
//将来自每个发件人的邮件收集在一起,保存在一个 multimap 中
multimap<string, const Message*> sender;
for (const auto& m : mfile)
{
string s;
if (find_from_addr(&m, s))
sender.insert(make_pair(s, &m));
}
//现在遍历 multimap,提取 John Doe 的邮件的主题
auto pp = sender.equal_range("John Doe <jdoe@machine.example>");
for (auto p = pp.first; p != pp.second; ++p)
cout << find_subject(p->second) << '\n';
//指定发件人
cout << "Enter sender: ";
cin.ignore(); //先把之前输入的最后换行符给去掉
string sdr;
getline(cin, sdr);
auto pp2 = sender.equal_range(sdr);
for (auto p = pp2.first; p != pp2.second; ++p)
cout << find_subject(p->second) << '\n';
return 0;
}
23.5
#include "23_Text.h"
#include <unordered_map>
#include <chrono>
// Exercise 23.5: stores the messages of a mail file, keyed by sender, in both
// a multimap and an unordered_multimap, then prints all messages from each
// container and reports how long each pass took.
int main()
{
    cout << "Enter an email file to process:\n";
    string fname;
    cin >> fname;
    Mail_file mfile{ fname }; // initialize mfile with the data read from the file

    // Group the messages of each sender in a multimap and, with the same
    // insertion sequence, in an unordered_multimap.
    multimap<string, const Message*> sender;
    unordered_multimap<string, const Message*> sender2;
    for (const auto& m : mfile)
    {
        string s;
        if (find_from_addr(&m, s))
        {
            sender.insert(make_pair(s, &m));
            sender2.insert(make_pair(s, &m));
        }
    }

    // Time the output of each container. steady_clock is used because it is
    // monotonic; system_clock may be adjusted while the measurement runs.
    const auto t1 = chrono::steady_clock::now();
    for (const auto& p : sender)
        print_mail(cout, p.second);
    const auto t2 = chrono::steady_clock::now();
    cout << "Multimap took "
         << chrono::duration_cast<chrono::milliseconds>(t2 - t1).count()
         << " milliseconds\n";

    const auto t3 = chrono::steady_clock::now();
    for (const auto& p : sender2)
        print_mail(cout, p.second);
    const auto t4 = chrono::steady_clock::now();
    cout << "Unordered_multimap took "
         << chrono::duration_cast<chrono::milliseconds>(t4 - t3).count()
         << " milliseconds\n";
    return 0;
}
23.6
#include <iomanip>
#include "23_Text.h"
//文本文件中的日期格式:月/日/年,比如12/24/2000, 1/1/2022
//月的范围[1,12],暂不检查日和年的合法性
int main()
try
{
cout << "Enter text file name: ";
string fname;
cin >> fname;
ifstream ifs{ fname };
if (!ifs)
throw runtime_error{ "Open file failed" };
regex pat_date{ R"((\d{1,2})/(\d{1,2})/(\d{4}))" };
unsigned lineno{ 0 };
for(string s; getline(ifs,s);)
{
++lineno;
smatch matches;
if(regex_search(s,matches,pat_date))
{
if (4 == matches.size()) //可以不用这个测试
{
int month{to<int,string>(matches[1])};
if (month >= 1 && month <= 12)
cout << setw(3) << lineno << ": " << s << '\n';
}
}
}
return 0;
}
catch(exception& e)
{
cerr << e.what() << endl;
}
23.8
#include <iostream>
#include <stdexcept>
#include <regex>
#include <string>
#include <fstream>
// Exercise 23.8: reads a regular expression and a file name from standard
// input, then prints every line of that file which contains a match,
// prefixed with its line number.
// Returns 0 on success, 1 for an invalid pattern or a reported exception,
// and 2 for an unknown exception.
int main()
try {
    std::string pat;
    std::cout << "enter pattern: ";
    std::getline(std::cin, pat); // read pattern

    std::regex pattern;
    try {
        pattern = pat; // this checks pat
        std::cout << "pattern: " << pat << '\n';
    }
    catch (const std::regex_error&) {
        std::cout << pat << " is not a valid regular expression\n";
        return 1;
    }

    std::cout << "now enter text file name: ";
    std::string fname;
    std::cin >> fname; // the name must be read before the file is opened
    std::ifstream ifs{ fname };
    if (!ifs)
        throw std::runtime_error{ "Open file failed" };

    int lineno = 0;
    // Search the lines of the file, not of standard input.
    for (std::string line; std::getline(ifs, line); ) {
        ++lineno;
        if (std::regex_search(line, pattern))
            std::cout << "line " << lineno << ": " << line << '\n';
    }
    return 0;
}
catch (const std::exception& e) {
    std::cerr << "Exception: " << e.what() << '\n';
    return 1;
}
catch (...) {
    std::cerr << "Unknown exception\n";
    return 2;
}
23.9
#include "../../std_lib_facilities.h"
#include "23_Text.h"
// Exercise 23.9: validates a tab-separated table file. The first line must
// look like a header; every later line must be a name followed by three
// numbers where the first two add up to the third. The last line is treated
// as the totals line and must equal the column sums of the lines before it.
int main()
try {
    // Ask for the file name.
    cout << "Enter table file name: ";
    string fname;
    cin >> fname;

    ifstream in{ fname }; // input file
    if (!in) error("no input file");

    const regex header{ R"(^[\w ]+(\t[\w ]+)*$)" };        // header line pattern
    const regex row{ R"(^[\w ]+(\t\d+)(\t\d+)(\t\d+)$)" }; // data line pattern

    string line; // input buffer
    if (getline(in, line) && !regex_match(line, header)) // check the header line
        error("no header");

    int lineno{ 0 };
    // Running column totals.
    int boys{ 0 };
    int girls{ 0 };

    while (getline(in, line))
    {
        ++lineno;
        smatch matches;
        if (!regex_match(line, matches, row))
            error("bad line: " + to_string(lineno));

        // Check the row: boys + girls must equal the row total.
        const int curr_boy = from_string<int>(matches[1]);
        const int curr_girl = from_string<int>(matches[2]);
        const int curr_total = from_string<int>(matches[3]);
        if (curr_boy + curr_girl != curr_total)
            error("bad row sum");

        // The line is the last one when the stream is already at end of file,
        // or reaches it after skipping whitespace. (An alternative would be to
        // recognize the totals line by its first column, "Alle klasser".)
        const bool last_line = in.eof() || (in >> ws).eof();
        if (!last_line)
        {
            // Update the totals.
            boys += curr_boy;
            girls += curr_girl;
            continue;
        }

        cout << "at eof\n";
        if (curr_boy != boys)
            error("boys don't add up");
        if (curr_girl != girls)
            error("girls don't add up");
        return 0;
    }
    error("didn't find total line");
}
catch (std::exception& e) {
    std::cerr << "Exception: " << e.what() << '\n';
    return 1;
}
catch (...) {
    std::cerr << "Unknown exception\n";
    return 2;
}
23.10
这题感觉代码实现得挺丑陋的。。。
#include <set>
#include "../../std_lib_facilities.h"
#include "23_Text.h"
// Number of boys and girls in one grade, and their sum.
struct Grade_students
{
int grade; // grade number (main() uses -1 for the REST group)
int boys;
int girls;
int total; // all students in this grade
};
// Writes gs to os as four tab-separated fields: grade, boys, girls, total.
ostream& operator<<(ostream& os, const Grade_students& gs)
{
    const char tab = '\t';
    return os << gs.grade << tab << gs.boys << tab << gs.girls << tab << gs.total;
}
int main()
try {
//获取文件名
cout << "Enter table file name: ";
string fname;
cin >> fname;
ifstream in{ fname }; //输入文件
if (!in) error("no input file");
string line; //输入缓冲区
regex header{ R"(^([\w ]+)(\t[\w ]+)*$)" }; //文件头行正则模式
regex row{ R"(^([\w ]+)(\t\d+)(\t\d+)(\t\d+)$)" }; //数据行正则模式
regex colum_1{ R"((\d+)\w+)" }; //第一列带数字的正则模式
if (getline(in, line)) //检查文件头行
{
smatch matches;
if (!regex_match(line, matches, header))
error("no header");
}
string tbl_hdr{ line }; //表头
constexpr int REST_grd { -1 }; //剩余没有年级的为一组
const string REST_str{ "REST"}; //剩余没有年级的为一组
Grade_students REST_students;
map<int, Grade_students> students; //统计学生,合并同年级的班级学生
int lineno{ 0 };
//列合计
int boys{ 0 };
int girls{ 0 };
while (getline(in, line))
{
++lineno;
smatch matches;
if (!regex_match(line, matches, row))
cerr << "bad line: " << lineno << '\n';
if (in.eof()) cout << "at eof\n";
//检查行
int curr_boy = from_string<int>(matches[2]);
int curr_girl = from_string<int>(matches[3]);
int curr_total = from_string<int>(matches[4]);
if (curr_boy + curr_girl != curr_total)
error("bad row sum");
//添加/合并进set
smatch matches2;
string str_col_1 = matches[1];
if(regex_match(str_col_1, matches2, colum_1)) //如果第一列匹配
{
int grd = from_string<int>(matches[1]);
auto p = students.find(grd);
if (p == students.end())
students[grd] = Grade_students{ grd,curr_boy,curr_girl,curr_total };
else
{
students[grd].boys += curr_boy;
students[grd].girls += curr_girl;
students[grd].total += curr_total;
}
}
else if(matches[1] == REST_str)
{
REST_students = { REST_grd,curr_boy,curr_girl,curr_total };
}
else if (matches[1] == "Alle klasser") //最后一行
{
if (curr_boy != boys)
error("boys don't add up");
if (curr_girl != girls)
error("girls don't add up");
if (!(in >> ws).eof())
error("characters after total line");
//输出新的表格到文件中
string ofname;
cout << "Enter ouput file name: ";
cin >> ofname;
ofstream out{ ofname }; //输入文件
if (!out) error("can't open output file");
out << tbl_hdr << '\n';
for (const auto& record : students)
out << record.second << '\n';
out << REST_str
<< '\t' << REST_students.boys
<< '\t' << REST_students.girls
<< '\t' << REST_students.total
<< '\n';
out << matches[1]
<< matches[2]
<< matches[3]
<< matches[4]
<< '\n';
return 0;
}
//更新合计
boys += curr_boy;
girls += curr_girl;
}
error("didn't find total line");
}
catch (std::exception& e) {
std::cerr << "Exception: " << e.what() << '\n';
return 1;
}
catch (...) {
std::cerr << "Unknown exception\n";
return 2;
}
23.12
#include "23_Text.h"
//文本文件中的日期格式:月/日/年,比如12/24/2000, 1/1/2022
//月的范围[1,12],暂不检查日和年的合法性
//ISO标准 yyyy-mm-dd
// Returns a copy of line in which every date written as month/day/year
// (1-2 digits / 1-2 digits / 4 digits) is rewritten in ISO form yyyy-mm-dd,
// with month and day padded to two digits. All other text is copied as is.
// The validity of month, day and year is not checked.
std::string replace_to_iso_date(const std::string& line)
{
    // Compiled once; the pattern never changes.
    static const std::regex pat_date{ R"((\d{1,2})/(\d{1,2})/(\d{4}))" };

    // One pass over the matches. Only text that matched the date pattern is
    // rewritten, so text that already looks like "5555-1-2" is left alone
    // instead of being zero-padded by follow-up replacement passes.
    std::string result;
    auto copied_up_to = line.cbegin();
    const std::sregex_iterator no_more;
    for (std::sregex_iterator it{ line.cbegin(), line.cend(), pat_date }; it != no_more; ++it)
    {
        const std::smatch& date = *it;
        result.append(copied_up_to, date[0].first); // text before this date

        const std::string month = date[1].str();
        const std::string day = date[2].str();
        result += date[3].str();
        result += '-';
        if (month.size() == 1)
            result += '0';
        result += month;
        result += '-';
        if (day.size() == 1)
            result += '0';
        result += day;

        copied_up_to = date[0].second;
    }
    result.append(copied_up_to, line.cend()); // text after the last date
    return result;
}
// Exercise 23.12: copies a text file to an output file, rewriting every
// month/day/year date in ISO yyyy-mm-dd form.
// Returns 0 on success and 1 when an exception is reported.
int main()
try
{
    cout << "Enter input file name: ";
    string ifname;
    cin >> ifname;
    cout << "Enter output file name: ";
    string ofname;
    cin >> ofname;
    ifstream ifs{ ifname };
    if (!ifs)
        throw runtime_error{ "Open input file failed" };
    ofstream ofs{ ofname };
    if (!ofs)
        throw runtime_error{ "Open output file failed" };

    // Method 1: rewrite the dates of each line and write the line out.
    for (string line; getline(ifs, line);)
        ofs << replace_to_iso_date(line) << '\n';

    // Method 2 (disabled, kept for reference): match one date at a time.
    // Its pattern lives here so that no unused regex is compiled at run time.
    //regex pat_date{ R"(^(.*?)(\d{1,2})/(\d{1,2})/(\d{4}))" }; // .*? is a lazy match, .* a greedy one
    //for (string line; getline(ifs, line);)
    //{
    //    smatch matches;
    //    if (regex_search(line, matches, pat_date))
    //    {
    //        do {
    //            int month{ to<int,string>(matches[2]) };
    //            /*if (month < 1 || month > 12)
    //                throw runtime_error("Invalid month");*/
    //            // convert to the ISO form, padding month and day to two digits
    //            ofs << matches[1] << matches[4] << '-';
    //            if (month < 10)
    //                ofs << '0';
    //            ofs << month << '-';
    //            if (matches[3].str().size() == 1)
    //                ofs << '0';
    //            ofs << matches[3];
    //            line = matches.suffix().str(); // the text after the date must be searched again
    //        } while (regex_search(line, matches, pat_date));
    //    }
    //    ofs << line << '\n';
    //}
    return 0;
}
catch (const exception& e)
{
    cerr << e.what() << '\n';
    return 1; // report failure instead of falling off the end, which would return 0
}
23.13
#include "../../std_lib_facilities.h"
// Exercise 23.13: reports whether the regular expression "." matches a
// newline character.
int main()
try
{
    string s{ "\n" };
    regex pat{ R"(.)" };
    smatch matches;
    if (regex_match(s, matches, pat))
        cout << ". can match \'\\n\'\n";
    else
        cout << ". cannot match \'\\n\'\n"; // the negative result needs its own message
    return 0;
}
catch (const exception& e)
{
    cerr << e.what() << '\n';
    return 1;
}
23.14
#include <iostream>
#include <stdexcept>
#include <string>
#include <fstream>
#include <regex>
using namespace std;
// Exercise 23.14: loads a whole text file into one string (newlines kept),
// then repeatedly reads a pattern from standard input and reports whether it
// matches anywhere in that text, listing the sub-matches.
// Returns 0 when the input ends, 1 for an unreadable file, an invalid pattern
// or a reported exception, and 2 for an unknown exception.
int main()
try {
    std::string fname;
    cout << "Enter input file name: ";
    cin >> fname;
    std::ifstream ifs{ fname };
    if (!ifs) {
        std::cerr << "Could not read from file " << fname << '\n';
        return 1;
    }
    cin >> ws; // discard all whitespace, including the newline after the file name

    std::string file_str;
    for (std::string s; std::getline(ifs, s); ) {
        file_str += s;
        file_str += '\n';
    }

    while (true) {
        std::cout << "enter pattern to test:\n";
        std::string pat;
        // Stop at end of input. Without this check a failed read leaves pat
        // empty, the empty pattern matches, and the loop never ends.
        if (!std::getline(std::cin, pat))
            break;

        std::regex pattern;
        try {
            pattern = pat; // test pattern
            std::cout << "pattern: " << pat << '\n';
        }
        catch (const std::regex_error&) {
            std::cerr << pat << " is not a valid regular expression\n";
            return 1;
        }

        std::smatch matches;
        if (std::regex_search(file_str, matches, pattern)) {
            std::cout << "Match!\n";
            for (const auto& m : matches)
                std::cout << '\t' << m << '\n';
        }
        else {
            std::cout << "No match..\n";
        }
    }
    return 0;
}
catch (const std::exception& e) {
    std::cerr << "Exception: " << e.what() << '\n';
    return 1;
}
catch (...) {
    std::cerr << "Unknown exception\n";
    return 2;
}