火车头采集器是一款基于Python语言开发的网络爬虫工具,用于快速高效地从互联网上采集数据并存储到本地或远程数据库。它简单易用且功能强大,在各行各业得到广泛应用。
火车头采集器AI伪原创PHP源码:
<?php
// Batch driver script. It repeatedly asks get_one_file() for a pending file,
// passes that file's content through the rewriting API configured in API_URL,
// and hands the result to save_new_file(), until get_one_file() returns a
// falsy value.
// NOTE(review): every helper called here (get_one_file, file_path,
// get_contents_filter, get_utf8_data, get_gbk_data, curl_request,
// content_format, save_new_file, file_ok, show_info_gb2312) is defined outside
// this section; remarks about them below are inferred from the call sites only.

// Declare the response as GB2312-encoded HTML.
header("Content-type: text/html; charset=gb2312");
// Remove the execution time limit so a long batch is not cut off.
set_time_limit(0);
// Report and display every error.
error_reporting(E_ALL);
ini_set('display_errors', '1');
// Base directory that the working folders below are built from.
define ("CUR_DIR", '../');
// NOTE(review): not referenced anywhere in this section; presumably consumed
// by one of the helpers - confirm.
define('TITLE_SEPAR', 'xxxxx');
// This is your API address; a key has to be requested at www.xiaofamao.com.
define('API_URL', 'http://api.xiaofamao.com/api.php?json=0&v=2&key=yuyu');
// Folder holding the files waiting to be processed.
define('CUR_FOLDER', CUR_DIR.'word');
// Folder that receives the output after processing.
define('DEST_FOLDER', CUR_DIR.'xiaofamao');
// Folder that receives the output after processing, for failures.
define('DEST_FOLDER_FAIL', CUR_DIR.'xiaofamao_fail');
$one_file = get_one_file();
// Check whether there is any task left; exit early when there is none.
if (!$one_file){
echo PHP_EOL.'* 任务数为0, 程序已退出';
exit;
}
// Process one file per iteration; the next file is fetched at the bottom of
// the loop body.
while ($one_file) {
echo PHP_EOL.'* 正在执行:' . PHP_EOL. file_path($one_file);
echo PHP_EOL.'* 请耐心等候...';
// get_contents_filter() yields an array with 'title' and 'content' entries.
$data_arr = get_contents_filter(file_path($one_file));
$title = $data_arr['title'];
// Keep the untouched title; it is what the output file is named after.
$title_src = $data_arr['title'];
$content = $data_arr['content'];
// Convert "<(...)" spans to "《...》" before the text is sent out.
$content = replace_shuminghao($content);
// Presumably converts the text to UTF-8 for the API - confirm in helper.
$content = get_utf8_data($content);
// Submit the text under the 'wenzhang' field and take the response as the
// new content.
$new_content = curl_request(API_URL, array('wenzhang'=>$content));
// Presumably converts the response back to GBK - confirm in helper.
$new_content = get_gbk_data($new_content);
// Strip the "《http...》" spans from the response.
$new_content = restore_shuminghao($new_content);
#var_dump($new_content);
$new_content = content_format($new_content);
// NOTE(review): self-assignment, has no effect.
$new_content = $new_content;
#$new_title = get_ai_title($title);
$title = get_utf8_data($title);
// Title rewriting is disabled (see the commented-out calls below), so the
// "new" title is simply the converted original title.
$new_title = $title;
//$new_title =curl_request(API_URL, array('wenzhang'=>$title));
//$new_title = get_gbk_data($new_title);
#echo $new_title;
# Original comment: "if the title rewrite succeeded".
// NOTE(review): $new_title was assigned $title just above, so it can never be
// identical to 'xx'.$title; this branch (the only writer to DEST_FOLDER_FAIL)
// is unreachable as the code stands.
if ($new_title === 'xx'.$title) {
$new_title = get_gbk_data($title);
//save_new_file(DEST_FOLDER_FAIL.DIRECTORY_SEPARATOR.$new_title.'.txt', $new_content);
save_new_file(DEST_FOLDER_FAIL.DIRECTORY_SEPARATOR.$title_src.'.txt', $new_content);
}
else {
// Clean up $new_title: drop tags and replace * " < > | with underscores.
// NOTE(review): in the two str_replace() calls for ':' and '?' the search and
// replacement are the same character, so they change nothing; possibly one
// side was a full-width character originally - confirm against upstream.
// NOTE(review): the cleaned $new_title is not used afterwards; the file is
// saved under the unmodified $title_src.
$new_title = strip_tags($new_title);
$new_title = str_replace(':', ":", $new_title);
$new_title = str_replace('?', "?", $new_title);
$new_title = str_replace(array('*','"','<','>','|'),'_', $new_title);
$new_title = get_gbk_data($new_title);
#$title = get_gbk_data($title);
#save_new_file(DEST_FOLDER.DIRECTORY_SEPARATOR.$new_title.'.txt', $new_content);
save_new_file(DEST_FOLDER.DIRECTORY_SEPARATOR.$title_src.'.txt', $new_content);
#save_new_file(DEST_FOLDER.DIRECTORY_SEPARATOR.$new_title.'.txt', $content);
}
#save_new_file(DEST_FOLDER.DIRECTORY_SEPARATOR.$title.'.txt', $new_content);
// Presumably marks the source file as done so get_one_file() does not return
// it again - confirm in helper.
file_ok($one_file);
//show_info_gb2312('伪原创结果:'.PHP_EOL. $final_data);
//save_new_file(DEST_FOLDER.DIRECTORY_SEPARATOR.$new_title, $new_content);
//var_dump($final_data);
// sleep(3);
// Fetch the next pending file; a falsy value ends the loop.
$one_file = get_one_file();
// show_reflesh();
}
// Report completion once no pending file is left.
show_info_gb2312(PHP_EOL.'任务完成'.PHP_EOL);
/**
 * Rewrite every "<(...)" span in $content as "《...》".
 *
 * The inner text is matched lazily, so each span ends at the first ")" that
 * follows its "<(". The pattern is applied byte-wise (no "u" modifier),
 * because the caller passes the text in before converting it to UTF-8.
 *
 * @param string $content Text to convert.
 * @return string The converted text; the input is returned unchanged when
 *                nothing matches or the regex engine reports an error.
 */
function replace_shuminghao($content) {
    // TODO(review): the copy this file was recovered from had a truncated
    // statement at this point (`$content = str_replace(';`), which made the
    // whole file a parse error. Its search/replacement arguments are lost, so
    // the call is omitted rather than guessed - restore it from the upstream
    // original if that is available.

    // A single pass replaces all spans; '$1' is the text between "<(" and ")".
    $converted = preg_replace('/<\((.*?)\)/', '《$1》', $content);

    // preg_replace() returns null on failure; fall back to the input instead
    // of handing null to the caller.
    return $converted === null ? $content : $converted;
}
/**
 * Delete every "《http...》" span from $content.
 *
 * A span starts at "《http" and ends at the first "》" after it. Text in
 * "《...》" that does not start with "http" is left alone.
 * NOTE(review): despite the name, the matched spans are replaced with an
 * empty string rather than converted back to another form - confirm that
 * this is the intended behaviour.
 *
 * @param string $content Text to clean.
 * @return string The text without the matched spans; unchanged when there is
 *                no match.
 */
function restore_shuminghao($content) {
    preg_match_all("/《http(.*?)》/", $content, $found);

    // Nothing matched: hand the text back untouched.
    if (!isset($found[0][0])) {
        return $content;
    }

    // With an array of needles str_replace() applies them one after another,
    // each on the result of the previous one, in match order.
    return str_replace($found[0], '', $content);
}
/**
 * Send $content to the endpoint configured in API_URL and return whatever
 * curl_request() gives back.
 *
 * The text is passed under the 'wenzhang' field. No encoding conversion or
 * error handling is done here.
 *
 * @param string $content Text to submit.
 * @return mixed The value returned by curl_request(), passed through as is.
 */
function content_rewrite($content) {
    $payload = array('wenzhang' => $content);

    return curl_request(API_URL, $payload);
}
function content_format($data) {
$data = fix_newline($data);
$data_arr = explode(PHP_EOL, $data);
$ret_str = '';
foreach($data_arr as $key =>