collector

<?php
// Crawls can run for a long time: lift the script execution time limit.
set_time_limit(0);
// Only send the header while that is still possible (nothing has been output yet).
if (!headers_sent()) {
    header('Content-Type: text/html; charset=utf-8');
}

/**
 * Rule-driven page collector (scraper).
 *
 * Typical use: register start URLs (addStartUrl), the link pattern of the
 * list page (addLayer(0, ...)) and the fields to extract (addField), then
 * call run() and export the rows with output(), saveXls() or saveSql().
 *
 * @author Administrator
 */
class Collector
{
    public $pages = array();
    /** Collected rows; each row maps field name => extracted value. */
    public $result = array();
    /** First-level (list page) URLs; may contain a "{from,to}" page range. */
    public $startUrls = array();
    /** Socket connect/read timeout in seconds. */
    public $timeout = 80;
    /** Raw response (headers + body) of the last request, before charset conversion. */
    public $httpContent;
    /** Information parsed from the last response header (currently only 'charset'). */
    public $httpHead = array();
    /** Custom request headers, name => value (see setHead()). */
    public $putHead = array();
    /** Legacy property, not read by this class; rules are stored in $field_arr. */
    public $fields = array();
    /** Field extraction rules, keyed by field name (filled by addField()). */
    public $field_arr = array();
    /** Number of layers (not read by this class). */
    public $deepth;
    /** Layer definitions, keyed by depth (filled by addLayer()). */
    public $layout_arr;
    /** Maximum number of detail pages to collect; 0 means unlimited. */
    public $limit = 0;
    /** Duration of the last run() in seconds. */
    public $runtime = 0;
    /** Charset of the scraped pages; 'UTF-8' means "use the charset detected in the response". */
    public $charset = 'UTF-8';
    /** Referer passed to the last getContent() call. */
    public $httpreferer;
    /** Next-page links of a field are followed only while this is 0. */
    public $pagelimit = 0;
    public $filepath = './';

    /**
     * Runs the collection: fetches every list page, extracts the detail
     * links with the layer-0 pattern and extracts all fields from each
     * detail page.
     *
     * @return array the collected rows (same as $this->result)
     */
    public function run()
    {
        $begintime = $this->microtime_float();
        $this->collect();
        $this->runtime = $this->microtime_float() - $begintime;
        return $this->result;
    }

    /**
     * Performs the actual crawl for run(); stops as soon as $limit detail
     * pages have been collected so no further pages are downloaded.
     */
    protected function collect()
    {
        if (!isset($this->layout_arr[0]['pattern'])) {
            $this->log('No layer 0 pattern configured; call addLayer(0, ...) before run().');
            return;
        }
        $pattern = $this->layout_arr[0]['pattern'];
        $collected = 0;
        foreach ($this->startUrls as $starturl) {
            foreach ($this->expandStartUrl($starturl) as $listurl) {
                if ($this->limit > 0 && $collected >= $this->limit) {
                    return;
                }
                $urllists = $this->getLists($pattern, $this->getContent($listurl));
                foreach ($urllists as $url) {
                    if ($this->limit > 0 && $collected >= $this->limit) {
                        return;
                    }
                    $this->filterContent($this->getContent($url, $listurl));
                    $collected++;
                }
            }
        }
    }

    /**
     * Expands a "{from,to}" page range in a start URL into one URL per page.
     *
     * Every URL is built from the untouched template; replacing the
     * placeholder in the template itself would make all later pages
     * resolve to the first page number.
     *
     * @param string $starturl
     * @return array list of concrete URLs (just the input when it has no range)
     */
    protected function expandStartUrl($starturl)
    {
        if (!preg_match('~\{(\d+),(\d+)\}~', $starturl, $pagenum)) {
            return array($starturl);
        }
        $urls = array();
        $pageend = intval($pagenum[2]);
        for ($page = intval($pagenum[1]); $page <= $pageend; $page++) {
            $urls[] = str_replace($pagenum[0], (string) $page, $starturl);
        }
        return $urls;
    }

    /**
     * Extracts the list of URLs matching a pattern from a piece of content.
     *
     * "{*}" in the pattern is a wildcard for any run of characters other
     * than quotes and ">". A pattern without wildcard is treated as a
     * fixed URL and returned as the only element.
     *
     * @param string $pattern
     * @param string $content
     * @return array unique matches (the complete matched text of each hit)
     */
    public function getLists($pattern = '', $content = '')
    {
        $pattern = (string) $pattern;
        if (strpos($pattern, '{*}') === false) {
            return array($pattern);
        }
        // Quote for the "~" delimiter used below, then re-open the wildcard.
        $regex = str_replace('\{\*\}', '([^\'\">]*)', preg_quote($pattern, '~'));
        if (!preg_match_all('~' . $regex . '~is', (string) $content, $preg_rs)) {
            return array();
        }
        return array_unique($preg_rs[0]);
    }

    /**
     * Fetches a URL with a plain HTTP/1.0 GET and returns the complete
     * response (headers included), converted to UTF-8 when necessary.
     *
     * Redirects are not followed. On a malformed URL or a connection
     * failure the error is logged and an empty string is returned.
     *
     * @param string $url     absolute URL, or one relative to $referer
     * @param string $referer URL of the referring page
     * @return string
     */
    public function getContent($url, $referer = '')
    {
        $url = $this->urlRtoA($url, $referer);
        $parts = parse_url($url);
        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : '';
        if (!is_array($parts) || empty($parts['host']) || ($scheme !== 'http' && $scheme !== 'https')) {
            $this->log('Unsupported or malformed URL: ' . $url);
            return '';
        }

        $secure = ($scheme === 'https');
        $defaultPort = $secure ? 443 : 80;
        $host = $parts['host'];
        $port = isset($parts['port']) ? (int) $parts['port'] : $defaultPort;
        // Origin servers expect the path (origin-form) in the request line.
        $path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
        if (isset($parts['query'])) {
            $path .= '?' . $parts['query'];
        }

        $errno = 0;
        $errstr = '';
        $fsp = fsockopen(($secure ? 'ssl://' : '') . $host, $port, $errno, $errstr, $this->timeout);
        if (!$fsp) {
            $this->log($errstr . '(' . $errno . ')');
            return '';
        }
        stream_set_timeout($fsp, (int) $this->timeout);

        $hostHeader = ($port === $defaultPort) ? $host : $host . ':' . $port;
        $defaultReferer = ($referer == '') ? $scheme . '://' . $host : $referer;
        fwrite($fsp, $this->buildRequest($path, $hostHeader, $defaultReferer));

        $content = '';
        while (!feof($fsp)) {
            $chunk = fgets($fsp, 256);
            if ($chunk === false) {
                // Read error or timeout: feof() may never become true, so stop here.
                break;
            }
            $content .= $chunk;
        }
        fclose($fsp);

        $this->getHead($content);
        $this->httpContent = $content;

        // A manually configured charset wins over the one detected in the response.
        $source = '';
        if (strtoupper($this->charset) != 'UTF-8') {
            $source = $this->charset;
        } elseif (!empty($this->httpHead['charset']) && $this->httpHead['charset'] != 'UTF-8') {
            $source = $this->httpHead['charset'];
        }
        if ($source !== '') {
            $converted = iconv($source, 'utf-8', $content);
            if ($converted === false) {
                $this->log('Charset conversion from ' . $source . ' failed for ' . $url);
            } else {
                $content = $converted;
            }
        }

        $this->httpreferer = $referer;
        return $content;
    }

    /**
     * Builds the raw GET request for getContent().
     *
     * Default Accept and User-Agent values are stored in $putHead when the
     * caller has not set them. The referer is computed per request and is
     * NOT stored, so it cannot stick to later requests. A header set under
     * the legacy, misspelled name "Refer" is sent as "Referer".
     *
     * @param string $path           request target, e.g. "/list.php?page=2"
     * @param string $hostHeader     value of the Host header
     * @param string $defaultReferer referer used when none was set through setHead()
     * @return string
     */
    protected function buildRequest($path, $hostHeader, $defaultReferer)
    {
        if (!isset($this->putHead['Accept'])) {
            $this->putHead['Accept'] = '*/*';
        }
        if (!isset($this->putHead['User-Agent'])) {
            $this->putHead['User-Agent'] = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2)';
        }

        $headers = $this->putHead;
        if (isset($headers['Refer'])) {
            if (!isset($headers['Referer'])) {
                $headers['Referer'] = $headers['Refer'];
            }
            unset($headers['Refer']);
        }
        if (!isset($headers['Referer'])) {
            $headers['Referer'] = $defaultReferer;
        }

        $linebreaks = array("\r", "\n");
        $request = "GET $path HTTP/1.0\r\nHost: $hostHeader\r\n";
        foreach ($headers as $headname => $headvalue) {
            // Line breaks are removed so a value cannot inject extra header lines.
            $request .= trim(str_replace($linebreaks, '', (string) $headname)) . ': '
                . trim(str_replace($linebreaks, '', (string) $headvalue)) . "\r\n";
        }
        return $request . "Connection: close\r\n\r\n";
    }

    /**
     * Extracts every registered field from a page and appends the row to
     * $this->result.
     *
     * @param string $content page content
     */
    public function filterContent($content = '')
    {
        $rs = array();
        foreach ($this->field_arr as $field => $fieldinfo) {
            $rs[$field] = $this->getPregField($fieldinfo, $content);
        }
        $this->result[] = $rs;
    }

    /**
     * Converts a relative URL into an absolute one.
     *
     * The fragment ("#...") is dropped. No "./" or "../" normalisation is
     * performed. When $referer is not an absolute URL the (fragment-less)
     * input is returned as is.
     *
     * @param string $relative
     * @param string $referer absolute URL of the page the link was found on
     * @return string
     */
    public function urlRtoA($relative, $referer)
    {
        $relative = (string) $relative;
        $referer = (string) $referer;

        // Drop the fragment.
        $pos = strpos($relative, '#');
        if ($pos !== false) {
            $relative = substr($relative, 0, $pos);
            if ($relative === '' && $referer !== '') {
                // A bare "#anchor" points at the referring document itself.
                $refpos = strpos($referer, '#');
                return ($refpos === false) ? $referer : substr($referer, 0, $refpos);
            }
        }

        // Already absolute.
        if (preg_match('~^(https?|ftp)://~i', $relative)) {
            return $relative;
        }

        // Split the referer into scheme, host and directory (query and fragment excluded).
        if (!preg_match('~^(https?|ftp)://([^/?#]+)([^?#]*/)?~i', $referer, $preg_rs)) {
            return $relative;
        }
        $root = $preg_rs[1] . '://' . $preg_rs[2];

        // Protocol-relative: "//host/path".
        if (strpos($relative, '//') === 0) {
            return $preg_rs[1] . ':' . $relative;
        }
        // Host-relative: "/path".
        if ($relative !== '' && $relative[0] === '/') {
            return $root . $relative;
        }
        $parentdir = (isset($preg_rs[3]) && $preg_rs[3] !== '') ? $preg_rs[3] : '/';
        return $root . $parentdir . $relative;
    }

    /**
     * Extracts one field from the content according to its rule.
     *
     * A pattern that does not contain the "{field}" placeholder is a fixed
     * value and returned unchanged. Otherwise the placeholder captures the
     * value (non-greedy); line breaks are removed, the optional replace
     * rule and callback are applied, and, while $pagelimit is 0, the value
     * of the page linked by the 'nextpage' pattern is appended recursively.
     *
     * @param array  $fieldinfo rule as built by addField()
     * @param string $content
     * @return string
     */
    public function getPregField($fieldinfo, $content)
    {
        $field = $fieldinfo['field'];
        $placeholder = '{' . $field . '}';

        // Fixed value: no placeholder in the pattern.
        if (strpos($fieldinfo['pattern'], $placeholder) === false) {
            return $fieldinfo['pattern'];
        }

        $group = '(?P<' . $field . '>.*?)';
        if (isset($fieldinfo['isregular']) && $fieldinfo['isregular'] == 'true') {
            $pattern = str_replace($placeholder, $group, $fieldinfo['pattern']);
        } else {
            // Quote pattern and placeholder the same way so the placeholder is still found.
            $pattern = str_replace(
                preg_quote($placeholder, '~'),
                $group,
                preg_quote($fieldinfo['pattern'], '~')
            );
        }

        $fieldresult = '';
        if (preg_match('~' . $pattern . '~is', (string) $content, $preg_rs) && isset($preg_rs[$field])) {
            $fieldresult = $preg_rs[$field];
        }

        // Remove line breaks.
        $fieldresult = str_replace(array("\r", "\n"), '', $fieldresult);

        // Optional second-pass replacement: array(regex body, replacement).
        $replace_arr = isset($fieldinfo['replace']) ? $fieldinfo['replace'] : '';
        if (is_array($replace_arr) && isset($replace_arr[0], $replace_arr[1])) {
            $replaced = preg_replace('~' . $replace_arr[0] . '~s', $replace_arr[1], $fieldresult);
            if ($replaced !== null) {
                $fieldresult = $replaced;
            }
        }

        // Follow the "next page" link and append that page's value.
        if ($this->pagelimit == 0 && isset($fieldinfo['nextpage']) && $fieldinfo['nextpage'] != '') {
            $nextpattern = str_replace('{nextpage}', '(?P<nextpage>[^\'\">]*?)', $fieldinfo['nextpage']);
            if (preg_match('~' . $nextpattern . '~is', (string) $content, $next_rs)
                && isset($next_rs['nextpage']) && $next_rs['nextpage'] != ''
            ) {
                $fieldresult .= $this->getPregField(
                    $fieldinfo,
                    $this->getContent($next_rs['nextpage'], $this->httpreferer)
                );
            }
        }

        if (!empty($fieldinfo['callback'])) {
            if (is_callable($fieldinfo['callback'])) {
                $fieldresult = call_user_func($fieldinfo['callback'], $fieldresult);
            } else {
                $this->log('Callback for field "' . $field . '" is not callable; value left unchanged.');
            }
        }
        return $fieldresult;
    }

    /**
     * Registers a field and its extraction rule.
     *
     * @param string       $field       field name; "{field}" is the capture placeholder in $pattern
     * @param string       $pattern     text around the value, or a fixed value without placeholder
     * @param array|string $replace_arr optional array(regex body, replacement) applied to the value
     * @param string       $isregular   'true' when $pattern is already a regular expression body
     * @param string       $nextpage    optional regex body with a "{nextpage}" placeholder for the next-page link
     * @param callable|string $callback optional function applied to the final value
     */
    public function addField($field, $pattern, $replace_arr = '', $isregular = 'false', $nextpage = '', $callback = '')
    {
        $this->field_arr[$field] = array(
            'field' => $field,
            'pattern' => $pattern,
            'replace' => $replace_arr,
            'isregular' => $isregular,
            'nextpage' => $nextpage,
            'callback' => $callback,
        );
    }

    /**
     * Prints the runtime and the collected rows as HTML.
     */
    public function output()
    {
        echo "The result is:<pre>";
        echo "runtime :$this->runtime S";
        // Scraped content is untrusted: escape it before embedding it in the page.
        echo htmlspecialchars(print_r($this->result, true), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        echo "</pre>";
    }

    /**
     * Writes the rows as tab-separated text (one row per line).
     *
     * @param string $file
     * @return bool false when the file could not be opened
     */
    public function saveXls($file = 'spider_result.xls')
    {
        $fp = fopen($file, 'w');
        if (!$fp) {
            $this->log('Unable to open ' . $file . ' for writing.');
            return false;
        }
        foreach ($this->result as $result) {
            $cells = array();
            foreach ($result as $value) {
                // Tabs and line breaks inside a value would break the row layout.
                $cells[] = str_replace(array("\t", "\r", "\n"), ' ', (string) $value);
            }
            fputs($fp, implode("\t", $cells) . "\n");
        }
        fclose($fp);
        echo 'The result has been saved to ' . $file . '. Cost time:' . $this->runtime;
        return true;
    }

    /**
     * Writes the rows as SQL INSERT statements.
     *
     * NOTE(review): values are escaped with addslashes(), which is not a
     * charset-aware SQL escape; review the file before importing data
     * scraped from untrusted sites.
     *
     * @param string $table
     * @param string $file
     * @return bool false when the file could not be opened
     */
    public function saveSql($table = 'spider_result', $file = 'spider_result.sql')
    {
        $fp = fopen($file, 'w');
        if (!$fp) {
            $this->log('Unable to open ' . $file . ' for writing.');
            return false;
        }
        $columns = array();
        foreach ($this->field_arr as $fieldinfo) {
            $columns[] = '`' . str_replace('`', '``', $fieldinfo['field']) . '`';
        }
        $sql_key = implode(', ', $columns);
        $sql_table = str_replace('`', '``', $table);
        foreach ($this->result as $result) {
            $sql_value = array();
            foreach ($result as $value) {
                $sql_value[] = "'" . $this->addslash((string) $value) . "'";
            }
            $line = "INSERT INTO `$sql_table` ( $sql_key ) VALUES (" . join(', ', $sql_value) . ");\r\n";
            fputs($fp, $line);
        }
        fclose($fp);
        echo 'The result has been saved to ' . $file . '. Cost time:' . $this->runtime;
        return true;
    }

    /**
     * Detects the charset of a response and stores it (upper-cased, or ''
     * when none is found) in $this->httpHead['charset'].
     *
     * The HTTP header block is searched first, then the whole response
     * (e.g. a <meta> tag).
     *
     * @param string $content raw response, headers included
     * @return array $this->httpHead
     */
    public function getHead($content)
    {
        $content = (string) $content;
        $parts = explode("\r\n\r\n", $content, 2);
        $charsetPattern = '~charset\s*=\s*["\']?([\w.:-]+)~i';
        $charset = '';
        if (preg_match($charsetPattern, $parts[0], $preg_rs) || preg_match($charsetPattern, $content, $preg_rs)) {
            $charset = strtoupper($preg_rs[1]);
        }
        $this->httpHead['charset'] = $charset;
        return $this->httpHead;
    }

    /**
     * Sets the charset of the pages to collect. Call this before run()
     * when the charset cannot be detected automatically.
     *
     * @param string $charset
     */
    public function setCharset($charset)
    {
        $this->charset = strtoupper($charset);
    }

    /**
     * Replaces the list of first-level (list page) URLs.
     *
     * @param array $url_arr
     */
    public function setStartUrls($url_arr)
    {
        $this->startUrls = $url_arr;
    }

    /**
     * Adds one first-level (list page) URL.
     *
     * @param string $url
     */
    public function addStartUrl($url)
    {
        $this->startUrls[] = $url;
    }

    /**
     * Registers a collection layer. run() only reads the 'pattern' of
     * layer 0; the other values are stored as given.
     *
     * @param integer $deep
     * @param string  $layout
     * @param string  $pattern
     * @param string  $isSimple
     * @param string  $isPageBreak
     */
    public function addLayer($deep, $layout, $pattern = '', $isSimple = 'false', $isPageBreak = 'false')
    {
        $this->layout_arr[$deep] = array(
            'layout' => $layout,
            'isSimple' => $isSimple,
            'isPageBreak' => $isPageBreak,
            'pattern' => $pattern,
        );
    }

    /**
     * Sets a custom request header.
     *
     * @param string $name
     * @param string $value
     */
    public function setHead($name, $value)
    {
        $this->putHead[$name] = $value;
    }

    /**
     * Removes the opening and closing tags of the given elements; the
     * text between them is kept.
     *
     * @param string $content
     * @param string $cleartags tag names (regex fragments allowed) separated by "|";
     *                          an empty name removes every tag
     * @return string
     */
    public function clearHtml($content, $cleartags = 'div')
    {
        foreach (explode('|', $cleartags) as $cleartag) {
            // The lookahead ends the tag name, so "div" no longer strips "<divider>".
            $name = ($cleartag === '') ? '' : '(?:' . $cleartag . ')(?=[\s/>])';
            $content = preg_replace('~</?' . $name . '[^>]*>~is', '', $content);
        }
        return $content;
    }

    /**
     * Logs a message (written to the output).
     *
     * @param string $str
     */
    public function log($str)
    {
        echo $str . "\n";
    }

    /**
     * Returns the duration of the last run() in seconds.
     *
     * @return float
     */
    public function getRuntime()
    {
        return $this->runtime;
    }

    /**
     * Current Unix timestamp with microseconds.
     *
     * @return float
     */
    public function microtime_float()
    {
        return microtime(true);
    }

    /**
     * Escapes a value for use inside a quoted SQL string (addslashes()).
     *
     * @param string $string
     * @return string
     */
    public function addslash($string)
    {
        return addslashes($string);
    }
}
?>

转载于:https://my.oschina.net/u/137226/blog/132474

### OpenTelemetry Collector 配置和使用指南

#### 1. OpenTelemetry Collector 基础概念

OpenTelemetry Collector 是一个灵活的数据收集工具,用于接收、处理并导出遥测数据(如指标、跟踪和日志)。它支持多种协议作为输入源,并能将数据发送至不同的后端存储系统。其核心功能包括接收器 (Receiver)、处理器 (Processor) 和导出器 (Exporter)[^2]。

#### 2. 安装 OpenTelemetry Collector

可以通过官方发布的二进制文件来安装 OpenTelemetry Collector。具体步骤如下:

- 下载对应版本的二进制包:

```bash
wget https://github.com/open-telemetry/opentelemetry-collector/releases/download/v0.87.0/otelcol_linux_amd64.zip
unzip otelcol_linux_amd64.zip
chmod +x ./otelcol
```

- 或者通过 Docker 运行容器化版本:

```bash
docker run --name otel-collector -p 4317:4317 -p 8889:8889 -v $(pwd)/config.yaml:/etc/otelcol/config.yaml ghcr.io/open-telemetry/opentelemetry-collector-contrib:latest
```

此命令会启动一个带有自定义配置文件 `config.yaml` 的 OpenTelemetry Collector 实例[^4]。

#### 3. 配置文件详解

OpenTelemetry Collector 使用 YAML 文件进行配置。以下是典型的配置结构及其组成部分说明:

- **Receivers**: 数据入口模块,负责监听来自不同来源的数据流。
- **Processors**: 对接收到的数据执行预处理操作,例如过滤、采样或增强元数据。
- **Exporters**: 将经过处理后的数据转发给目标后端服务。
- **Service Pipelines**: 描述完整的管道逻辑链路,连接 Receivers, Processors 和 Exporters。

下面是一个简单的例子,展示如何设置 OTLP 接收器并通过 Prometheus 导出器对外暴露指标:

```yaml
receivers:
  otlp:
    protocols:
      grpc:

exporters:
  prometheus:
    endpoint: "0.0.0.0:8889"

processors:
  batch:

service:
  pipelines:
    metrics:
      receivers: [otlp]
      processors: [batch]
      exporters: [prometheus]
```

该配置表示启用了 gRPC 协议下的 OTLP 收集接口,所有的度量标准都会被批量打包,并由 Prometheus 导出器在 8889 端口上暴露,供 Prometheus 服务器抓取(与上文 Docker 命令中映射的 8889 端口一致)[^3]。

#### 4. 日志迁移实践案例

对于希望从 Logstash 切换过来的团队来说,《Logstash Pipeline Migration Guide》提供了一条清晰的迁移路径,指导他们完成这一过程。主要工作包括识别现有插件的映射关系,以及调整语法以适配新框架的需求[^5]。

---
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值