html2txt h2t.c

该程序将HTML内容转换为纯文本格式,支持处理常见的HTML标签和特殊字符,如版权符号等,并能忽略脚本和样式标签。
/*//////////////////////

文件名:	h2t.c v0.2

作者:	苏晓(suxiaojack)

日期:	2008.7

用途:	转换HTML内容为TXT文本

许可 ( License ):	GPL



v0.2

 处理Bug

1、修正无法识别&#数字;问题 UNICODE=>GB2312

2、添加&copy;和&reg;处理

3、修正&处理死循环.



v0.1



//////////////////////*/



#include <ctype.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <windows.h>



UnicodeToGB2312(char* pOut,unsigned short uData)

{

	WideCharToMultiByte(CP_ACP,NULL,&uData,1,pOut,sizeof(WCHAR),NULL,NULL);

	return;

}



/* Size of the per-line read buffer. Parenthesised so the macro expands
 * safely inside larger expressions (e.g. x / BUFSIZE). */
#define BUFSIZE (1024*1024*2)

/* Whole input file as read from disk (NUL-terminated). */
char buf[1024*1024*20];

/* Lower-cased copy of buf, used for case-insensitive tag searches. */
char shadowbuf[1024*1024*20];

/* Scratch buffer for a single fgets() call. */
char buffer[BUFSIZE];

/* Number of bytes currently stored in buf (excluding the terminator). */
long size;

/* Line-break style detected in the input:
 * 0 = not detected yet, 1 = CR, 2 = CRLF, 3 = LF. */
int type=0;



/* Number of entries in toc (entity/replacement pairs, so twice the number
 * of supported entities). */
#define tocsize 14

/*
 * Entity translation table laid out as pairs: toc[2k] is the HTML entity,
 * toc[2k+1] is the text printed in its place.
 *
 * Only the most common entities are listed; there are far too many to
 * cover them all. The Windows console cannot output symbols such as the
 * registered-trademark sign, hence the textual stand-ins for &copy; and
 * &reg;.
 */
char* toc[tocsize]={"&nbsp;"," ","&lt;","<","&gt;",">","&quot;","\"","&amp;","&","&copy;","◎版权","&reg;","◎注册"};



/*
 * Print the program banner, the command-line synopsis and the argument
 * help to stdout.
 *
 * argv: the program's argument vector; only argv[0] (the program name) is
 *       read.
 */
void usage(char** argv)
{
	/* Line terminators are real "\n" escapes so each sentence ends its line. */
	const char *us="用来转换html =>txt. ver0.2\n"
			"suxiaojack写于2008.7\n";
	/* NOTE(review): "tstart_in_tag_text" does not match the
	 * <start_in_tag_text> placeholder printed below; it may be a typo or a
	 * lost tab escape - confirm before changing the text. */
	const char *ue="tstart_in_tag_text:开始的Tag标记中的特征文字,好理解end_in_tag_text了。\n"
	"jump_num:跳过几次开始找到的,默认为0.\n"
	"注意不支持正则式!未曾处理水印文字。\n";

	printf("%s",us);
	printf("使用方法:%s <file> [ <start_in_tag_text> [jump_num] <end_in_tag_text> ] \n",argv[0]);
	printf("%s",ue);
}



/*
 * Fast bounded substring search: look for f inside the first 15 bytes of s.
 *
 * s: NUL-terminated text to search; only its first 15 bytes are examined.
 * f: NUL-terminated needle.
 *
 * Returns the offset of the first occurrence of f within that 15-byte
 * window (0 means "s starts with f"), or -1 when f does not occur there.
 */
int ministrstr(const char *s, const char *f)
{
	char minibuf[16];
	size_t n = 0;
	const char *hit;

	/* Copy at most 15 bytes but stop at the terminator, so a short s is
	 * never read past its end. */
	while (n < sizeof minibuf - 1 && s[n] != '\0') {
		minibuf[n] = s[n];
		n++;
	}
	minibuf[n] = '\0';

	/* strstr() yields NULL on a miss; subtracting from NULL is undefined,
	 * so report a miss explicitly. */
	hit = strstr(minibuf, f);
	return hit ? (int)(hit - minibuf) : -1;
}



/*
 * Case-insensitive variant of the bounded search: the first 15 bytes of s
 * are lower-cased before looking for f.
 *
 * s: NUL-terminated text to search; only its first 15 bytes are examined.
 * f: NUL-terminated needle; it is compared as given, so callers pass it in
 *    lower case.
 *
 * Returns the offset of the first occurrence of f within the lower-cased
 * window (0 means "s starts with f"), or -1 when f does not occur there.
 */
int ministrstri(const char *s, const char *f)
{
	char minibuf[16];
	size_t n = 0;
	const char *hit;

	/* Copy and lower-case in one pass, stopping at the terminator so a
	 * short s is never read past its end. The unsigned char cast keeps
	 * tolower() defined for bytes >= 0x80. */
	while (n < sizeof minibuf - 1 && s[n] != '\0') {
		minibuf[n] = (char)tolower((unsigned char)s[n]);
		n++;
	}
	minibuf[n] = '\0';

	/* strstr() yields NULL on a miss; report that as -1 instead of doing
	 * pointer arithmetic on NULL. */
	hit = strstr(minibuf, f);
	return hit ? (int)(hit - minibuf) : -1;
}



// Convert character entities such as &nbsp; to their plain-text replacement

/*
 * Translate the character entity at the start of streamstart.
 *
 * The toc table is walked in entity/replacement pairs. When the stream
 * begins with one of the listed entities, its replacement text is written
 * to stdout. When nothing matches, a single '&' is written instead.
 *
 * streamstart: text positioned at the '&' that introduces the entity.
 *
 * Returns the number of input bytes consumed: the length of the matched
 * entity, or 1 when no entity matched (only the '&' is consumed).
 */
int isintoc(char* streamstart)
{
	int consumed = 0;
	int pair;

	for (pair = 0; pair < tocsize; pair += 2) {
		if (ministrstr(streamstart, toc[pair]) != 0)
			continue;	/* stream does not start with this entity */

		fputs(toc[pair + 1], stdout);
		consumed = strlen(toc[pair]);
		break;
	}

	if (consumed == 0) {
		/* Nothing was translated: pass the ampersand through. */
		putchar('&');
		consumed = 1;
	}
	return consumed;
}





/*
 * Decode a decimal numeric character reference and print the character.
 *
 * numstart: text positioned just after "&#", i.e. at the first digit.
 *
 * The decimal digits are read as a UTF-16 code unit, converted with
 * UnicodeToGB2312() and written to stdout. A terminating ';' is consumed
 * only when it is actually present.
 *
 * Malformed input:
 *  - no digits at all (e.g. a hexadecimal reference): "&#" is printed
 *    verbatim and nothing is consumed, so the rest passes through as text;
 *  - value above 0xFFFF: it cannot be represented in one UTF-16 unit, so
 *    '?' is printed instead of a silently truncated character.
 *
 * Returns the number of bytes consumed starting at numstart.
 */
int num2txt(char* numstart)
{
	char *s = numstart;
	unsigned long code = 0;
	char os[3];

	/* Accumulate the value directly; no intermediate buffer means an
	 * arbitrarily long digit run cannot overflow anything. Once the value
	 * exceeds 16 bits it is left saturated. */
	while (*s >= '0' && *s <= '9') {
		if (code <= 0xFFFFUL)
			code = code * 10 + (unsigned long)(*s - '0');
		s++;
	}

	if (s == numstart) {
		fputs("&#", stdout);
		return 0;
	}

	if (code > 0xFFFFUL) {
		putchar('?');
	} else {
		/* The converter does not terminate its output, so pre-zero. */
		memset(os, 0, sizeof os);
		UnicodeToGB2312(os, (unsigned short)code);
		fputs(os, stdout);
	}

	/* Skip the terminator only if it is really a ';'. */
	if (*s == ';')
		s++;

	return (int)(s - numstart);
}



/*
 * Load the whole stream into the global buf and build its lower-cased
 * shadow copy.
 *
 * fp: stream opened for reading.
 *
 * On return buf holds the NUL-terminated content, size holds its length
 * and shadowbuf holds a lower-cased copy of buf. Input beyond the capacity
 * of buf is silently dropped rather than written past the array.
 */
void read2buf(FILE* fp)
{
	const size_t cap = sizeof buf - 1;	/* keep room for the terminator */
	size_t used = 0;

	/* Loop on the fgets() result rather than feof(): feof() only turns
	 * true after a read has failed, which made the old loop append the
	 * final line a second time. */
	while (used < cap && fgets(buffer, sizeof buffer, fp) != NULL) {
		size_t n = strlen(buffer);

		if (n > cap - used)
			n = cap - used;
		memcpy(buf + used, buffer, n);
		used += n;
	}
	buf[used] = 0;
	size = (long)used;

	memcpy(shadowbuf, buf, used + 1);
	strlwr(shadowbuf);
}



/*
 * Locate the start of the region to convert.
 *
 * start: text that occurs inside the opening tag; it is lower-cased in
 *        place before searching the lower-cased shadow buffer.
 * jump:  number of earlier occurrences of start to skip (0 = use the
 *        first occurrence).
 *
 * Returns the index of the first character after the '>' that follows the
 * selected occurrence, or -1 when the occurrence or the '>' is missing.
 */
int findstart(char* start,int jump)
{
	char* pos=shadowbuf;

	/* A negative count would otherwise keep searching until the text runs
	 * out; it can never select an occurrence. */
	if (jump < 0)
		return -1;

	strlwr(start);
	do {
		pos = strstr(pos, start);
		if (pos == NULL)	/* test the pointer itself, not NULL - base */
			return -1;
		pos++;
	} while (jump--);

	/* Find the end of the tag without running past the terminator. */
	while (*pos != '\0' && *pos != '>')
		pos++;
	if (*pos == '\0')
		return -1;

	return (int)(pos + 1 - shadowbuf);
}

/*
 * Locate the end of the region to convert.
 *
 * end:   text that occurs inside the closing tag; it is lower-cased in
 *        place before searching the lower-cased shadow buffer.
 * start: index in the buffer at which the search begins.
 *
 * Returns the index of the last character before the '<' that opens the
 * tag containing end (an inclusive end index), or -1 when end is not found
 * or no '<' precedes it.
 */
int findend(char* end,int start)
{
	char* pos;

	if (start < 0)
		return -1;

	strlwr(end);
	pos = strstr(shadowbuf + start, end);
	if (pos == NULL)	/* test the pointer itself, not NULL - base */
		return -1;

	/* Walk back to the '<' of the enclosing tag, never stepping before the
	 * beginning of the buffer. */
	while (pos > shadowbuf && *pos != '<')
		pos--;
	if (*pos != '<')
		return -1;

	return (int)(pos - shadowbuf) - 1;
}



/*
 * Emit one line break in the style detected from the input document.
 *
 * The global type selects the sequence: 1 = CR, 2 = CRLF, 3 = LF. Any
 * other value (including 0, "not detected yet") prints nothing.
 */
void printline(void)
{
	switch (type) {
	case 1:
		putchar('\r');
		break;
	case 2:
		fputs("\r\n", stdout);
		break;
	case 3:
		putchar('\n');
		break;
	default:
		break;
	}
}



/*
 * Skip an element whose content must not be printed (script or style).
 *
 * p:        points at the '<' of the opening tag.
 * limit:    one past the last byte that may be examined.
 * closetag: lower-case prefix of the closing tag, e.g. "</script".
 *
 * Returns the position just after the '>' of the closing tag, or limit
 * when the input ends before the element is closed.
 */
static char *h2t_skip_element(char *p, char *limit, char *closetag)
{
	p++;	/* step over the '<' of the opening tag */

	for (;;) {
		while (p < limit && *p != '<')
			p++;
		if (p >= limit)
			return limit;
		if (ministrstri(p, closetag) == 0)
			break;	/* found the closing tag */
		p++;		/* some other '<' inside the element */
	}

	while (p < limit && *p != '>')
		p++;
	return (p < limit) ? p + 1 : limit;
}

/*
 * Convert a span of HTML to plain text on stdout.
 *
 * s:   first byte of the HTML to convert.
 * len: number of bytes to convert.
 *
 * Text outside tags is copied through. Character entities are translated
 * by isintoc() and decimal numeric references by num2txt(). <script> and
 * <style> elements are dropped together with their content. "</br>",
 * "</p>" and "<br>" produce a line break via printline(). Every other tag
 * is removed. The first line break seen in the input fixes the global
 * line-break style (type) used by printline().
 */
void h2t(char* s,int len)
{
	char *ss;
	char *end;

	if (len <= 0)
		return;

	ss = s;
	end = s + len;

	while (ss < end) {
		/* Detect the document's line-break style once. */
		if (type == 0 && (*ss == '\r' || *ss == '\n')) {
			if (*ss == '\r' && ss + 1 < end && ss[1] == '\n')
				type = 2;	/* CRLF */
			else if (*ss == '\n')
				type = 3;	/* LF */
			else
				type = 1;	/* CR */
		}

		if (*ss != '<') {
			/* Not a tag. */
			if (*ss != '&') {
				putchar(*ss);
				ss++;
			} else if (ss + 1 < end && ss[1] == '#') {
				/* Numeric reference: skip "&#", then decode. */
				ss += 2;
				ss += num2txt(ss);
			} else {
				int used = isintoc(ss);

				/* Always make progress, even if the helper
				 * reports that nothing was consumed. */
				ss += (used > 0) ? used : 1;
			}
			continue;
		}

		if (!ministrstri(ss, "<script")) {
			ss = h2t_skip_element(ss, end, "</script");
		} else if (!ministrstri(ss, "<style")) {
			ss = h2t_skip_element(ss, end, "</style");
		} else if (!ministrstri(ss, "</br>")) {
			printline();
			ss += 5;
		} else if (!ministrstri(ss, "</p>")) {
			printline();
			ss += 4;
		} else if (!ministrstri(ss, "<br>")) {
			printline();
			ss += 4;
		} else {
			/* Ordinary tag: drop everything up to and including '>'. */
			while (ss < end && *ss != '>')
				ss++;
			if (ss >= end)
				break;
			ss++;
		}
	}
}



/*
 * Entry point.
 *
 *   h2t <file>
 *       convert the whole file;
 *   h2t <file> <start_in_tag_text> <end_in_tag_text>
 *       convert only the region between the tag containing the start text
 *       and the tag containing the end text;
 *   h2t <file> <start_in_tag_text> <jump_num> <end_in_tag_text>
 *       same, but skip jump_num earlier occurrences of the start text.
 *
 * Any other argument count, or a file that cannot be opened, prints the
 * usage text. Exits with status 1 when a marker text cannot be found.
 */
int main(int argc,char *argv[])
{
	FILE *fp;
	char *start_text;
	char *end_text;
	int start,end,jump;

	if (argc != 2 && argc != 4 && argc != 5) {
		usage(argv);
		return 0;
	}

	fp = fopen(argv[1], "r");
	if (!fp) {
		usage(argv);
		exit(0);
	}
	read2buf(fp);
	fclose(fp);	/* the content is fully buffered; the stream is done */

	if (argc == 2) {
		h2t(buf, (int)size);
		return 0;
	}

	/* The end marker is always the last argument; the skip count is
	 * present only in the five-argument form. */
	start_text = argv[2];
	end_text = argv[argc - 1];
	jump = (argc == 5) ? atoi(argv[3]) : 0;

	start = findstart(start_text, jump);
	if (start < 0) {
		printf("can't find:%s\n", start_text);
		exit(1);
	}

	end = findend(end_text, start);
	if (end < 0) {
		printf("can't find:%s\n", end_text);
		exit(1);
	}

	if (end >= start) {
		/* findend() returns the index of the last content byte, so the
		 * inclusive range [start, end] holds end - start + 1 bytes. */
		h2t(buf + start, end - start + 1);
	} else {
		usage(argv);
	}

	return 0;
}



/*

与noblank联合使用

h2t filename.htm |noblank >out.txt

*/
elAdmin- 2025-10-29 08:18:33 [main] INFO c.t.s.c.d.management.system.AppRun - Starting AppRun v1.0.8 using Java 17.0.15 on common-data-management-system-75b7965956-wr8bt with PID 7 (/home/ubuntu/server/lib/server.jar started by root in /) elAdmin- 2025-10-29 08:18:33 [main] INFO c.t.s.c.d.management.system.AppRun - The following 3 profiles are active: "common", "k8s", "secret" elAdmin- 2025-10-29 08:18:59 [main] INFO o.a.coyote.http11.Http11NioProtocol - The ["http-nio-8091"] connector has been configured to support HTTP upgrade to [h2c] elAdmin- 2025-10-29 08:18:59 [main] INFO o.a.coyote.http11.Http11NioProtocol - Initializing ProtocolHandler ["http-nio-8091"] elAdmin- 2025-10-29 08:18:59 [main] INFO o.a.catalina.core.StandardService - Starting service [Tomcat] elAdmin- 2025-10-29 08:18:59 [main] INFO o.a.catalina.core.StandardEngine - Starting Servlet engine: [Apache Tomcat/9.0.83] elAdmin- 2025-10-29 08:19:02 [main] INFO c.a.d.s.b.a.DruidDataSourceAutoConfigure - Init DruidDataSource elAdmin- 2025-10-29 08:19:03 [main] INFO c.alibaba.druid.pool.DruidDataSource - {dataSource-1} inited elAdmin- 2025-10-29 08:19:06 [main] INFO o.h.jpa.internal.util.LogHelper - HHH000204: Processing PersistenceUnitInfo [name: default] elAdmin- 2025-10-29 08:19:07 [main] INFO org.hibernate.Version - HHH000412: Hibernate ORM core version 5.6.15.Final elAdmin- 2025-10-29 08:19:08 [main] INFO o.h.annotations.common.Version - HCANN000001: Hibernate Commons Annotations {5.1.2.Final} elAdmin- 2025-10-29 08:19:11 [main] INFO org.hibernate.dialect.Dialect - HHH000400: Using dialect: org.hibernate.dialect.MySQL5InnoDBDialect elAdmin- 2025-10-29 08:19:20 [main] INFO o.h.e.t.j.p.i.JtaPlatformInitiator - HHH000490: Using JtaPlatform implementation: [org.hibernate.engine.transaction.jta.platform.internal.NoJtaPlatform] elAdmin- 2025-10-29 08:19:25 [main] INFO org.quartz.impl.StdSchedulerFactory - Using default implementation for ThreadExecutor elAdmin- 2025-10-29 08:19:25 [main] INFO 
o.quartz.core.SchedulerSignalerImpl - Initialized Scheduler Signaller of type: class org.quartz.core.SchedulerSignalerImpl elAdmin- 2025-10-29 08:19:25 [main] INFO org.quartz.core.QuartzScheduler - Quartz Scheduler v.2.3.2 created. elAdmin- 2025-10-29 08:19:25 [main] INFO org.quartz.simpl.RAMJobStore - RAMJobStore initialized. elAdmin- 2025-10-29 08:19:25 [main] INFO org.quartz.core.QuartzScheduler - Scheduler meta-data: Quartz Scheduler (v2.3.2) 'quartzScheduler' with instanceId 'NON_CLUSTERED' Scheduler class: 'org.quartz.core.QuartzScheduler' - running locally. NOT STARTED. Currently in standby mode. Number of jobs executed: 0 Using thread pool 'org.quartz.simpl.SimpleThreadPool' - with 10 threads. Using job-store 'org.quartz.simpl.RAMJobStore' - which does not support persistence. and is not clustered. elAdmin- 2025-10-29 08:19:25 [main] INFO org.quartz.impl.StdSchedulerFactory - Quartz scheduler 'quartzScheduler' initialized from an externally provided properties instance. elAdmin- 2025-10-29 08:19:25 [main] INFO org.quartz.impl.StdSchedulerFactory - Quartz scheduler version: 2.3.2 elAdmin- 2025-10-29 08:19:25 [main] INFO org.quartz.core.QuartzScheduler - JobFactory set to: org.springframework.scheduling.quartz.SpringBeanJobFactory@682e422c elAdmin- 2025-10-29 08:19:27 [main] INFO org.redisson.Version - Redisson 3.17.1 elAdmin- 2025-10-29 08:19:29 [redisson-netty-2-12] INFO o.r.c.p.MasterPubSubConnectionPool - 1 connections initialized for dev-redis-all-common-aps1.base-service.svc.cluster.local/10.53.6.108:6379 elAdmin- 2025-10-29 08:19:29 [redisson-netty-2-20] INFO o.r.c.pool.MasterConnectionPool - 24 connections initialized for dev-redis-all-common-aps1.base-service.svc.cluster.local/10.53.6.108:6379 elAdmin- 2025-10-29 08:20:02 [main] INFO o.a.coyote.http11.Http11NioProtocol - Starting ProtocolHandler ["http-nio-8091"] elAdmin- 2025-10-29 08:20:03 [main] INFO o.a.coyote.http11.Http11NioProtocol - The ["http-nio-9090"] connector has been configured to 
support HTTP upgrade to [h2c] elAdmin- 2025-10-29 08:20:03 [main] INFO o.a.coyote.http11.Http11NioProtocol - Initializing ProtocolHandler ["http-nio-9090"] elAdmin- 2025-10-29 08:20:03 [main] INFO o.a.catalina.core.StandardService - Starting service [Tomcat] elAdmin- 2025-10-29 08:20:03 [main] INFO o.a.catalina.core.StandardEngine - Starting Servlet engine: [Apache Tomcat/9.0.83] elAdmin- 2025-10-29 08:20:03 [main] INFO o.a.c.c.C.[Tomcat-1].[localhost].[/] - Initializing Spring embedded WebApplicationContext elAdmin- 2025-10-29 08:20:03 [main] INFO o.a.coyote.http11.Http11NioProtocol - Starting ProtocolHandler ["http-nio-9090"] elAdmin- 2025-10-29 08:20:03 [main] INFO org.quartz.core.QuartzScheduler - Scheduler quartzScheduler_$_NON_CLUSTERED started. elAdmin- 2025-10-29 08:20:03 [main] INFO c.t.s.c.d.management.system.AppRun - Started AppRun in 93.99 seconds (JVM running for 99.442) elAdmin- 2025-10-29 08:20:04 [main] INFO c.t.s.c.d.m.s.m.q.config.JobRunner - Timing task injection complete elAdmin- 2025-10-29 08:20:04 [main] INFO c.t.s.c.d.management.system.AppRun - --------------------------------------------- elAdmin- 2025-10-29 08:20:04 [main] INFO c.t.s.c.d.management.system.AppRun - Local: http://localhost:8091 elAdmin- 2025-10-29 08:20:04 [main] INFO c.t.s.c.d.management.system.AppRun - Swagger: http://localhost:8091/doc.html elAdmin- 2025-10-29 08:20:04 [main] INFO c.t.s.c.d.management.system.AppRun - --------------------------------------------- elAdmin- 2025-10-29 08:20:12 [http-nio-9090-exec-1] INFO o.a.c.c.C.[Tomcat-1].[localhost].[/] - Initializing Spring DispatcherServlet 'dispatcherServlet' 卡在这里不动了,为什么
10-30
Step1:导入相关请求、解析、存储相关第三方库 import requests from bs4 import BeautifulSoup Step2:创建getHtml()函数实现请求网页 def getHtml(url): hd={'User-Agent':'Molliza/5.0'} r=requests.get(url,headers=hd,timeout=30) r.encoding=r.apparent_encoding print(r.status_code) #print(r.text) return r.text Step3:创建extracHtml()函数实现解析与提取网页信息,并输出该小说所有章节链接 def extrachtml(html): soup=BeautifulSoup(html,'html.parser') novel_dir=soup.select('body > div.main > div.cover-mian-row > div > div > div.catalog-cnt > div.chapter-list > ul > li > a') #print(novel_dir) links=[] for t in novel_dir: links.append(t.get('href')) print(links) return links Step4:创建novel_content() 函数 实现 解析与提取小说章节标题和内容,并输出 前2章的小说标题和内容 def novel_content(ht): '''解析与提取小说内容''' s=BeautifulSoup(ht,'html.parser') selection=s.find('div',class_='read-content') title=selection.find('h2').get_text() print(title) content=selection.find('p').get_text() print(content) save_txt(title,content) Step5:创建save_txt()函数存储下载小说文章 def save_txt(title,content): '''存储小说''' path="/data/workspace/downloadfiles/" with open(f'{path+title}.txt','w',encoding="utf-8",newline="")as f: f.write(content) 填写请求网页、调用getHtml()、extracHtml() url='https://book.xxs8.com/732880/' html=getHtml(url) links=extrachtml(html) for u in links[:2]: ht=getHtml(u) novel_content(ht)结合这个代码来写
最新发布
12-05
1 HTM(HTML)转TXT 所需软件:HTML2TXT.exe 说明:启动软件,单击“添加文件”按钮找到需要转换HTML文件;选择一个输出文件夹,然后单击“开始”按钮即可得到反编译后的HTM文件。 2 CHM转TXT 所需软件:CHM Encoder 说明:启动软件,单击“打开”按钮,找到需要转换的CHM文件;选择一个输出文件夹,然后单击“开始”按钮,即可得到反编译后的HTM文件。接下来执行上面讲过的HTM(HTML)转TXT的步骤,即可得到TXT文件。 3 PDF转TXT 所需软件:PDF Text Converter 说明:启动软件后,先在左上方的浏览器窗口内找到PDF文件所在的文件夹,此时左下方的文件列表窗口中会显示出该文件夹中的PDF文件,选中欲转换的文件,单击工具栏上的“添加文件”按钮,即会在右上方的窗口中列出等待转换的文件,添加完成后,单击齿轮形状的“转换”按钮,即可完成转换转换好的TXT文件保存在源文件所在文件夹内。 4 JAR转TXT 所需软件:WinRAR 说明:只要你的电脑上安装了WinRAR,即可右击JAR文件,直接将其解压缩。此时你需要做的,就是将这些没有扩展名的文件加上TXT的扩展名并合并(快捷的方法下面会有介绍)。 5 UMD转TXT 所需软件:XBookMaker 说明:启动软件后,单击左上方的“Open folder”按钮,定位到包含UMD电子书的文件夹,此时会在左边的的“Files”窗口列出该文件夹内包含的所有UMD格式的电子书,点击一个UMD文件,左下方的“Chapters”窗口中即会显示该电子书中所包含的章节。单击任意一个章节,即可在右边的窗口中看到该章节的内容。现在你只需将其复制并将其粘贴到记事本中,保存为TXT即可, 6 EXE转TXT 所需软件:miniKillEBook 说明:先打开想要转换的EXE电子书,再启动miniKillEBook,勾选“以文本方式保存”复选框,拖动图中的图标到电子书窗口中,单击“开始”按钮,软件即会自动将电子书“逐页翻过”并将每一页的内容存为TXT文本。 刚看到的软件,我比较喜欢用手机看书,这个很方便
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值