Apache module杂记-CSDN博客

本文探讨了在Apache环境下使用mod_line_edit模块基于正则表达式替换HTML内容的经验，包括性能问题分析及优化策略。重点介绍了如何在特定HTML标签后插入元信息，并通过环境变量动态调整。

[size=large]可以基于正则表达式修改文本内容的apache module:[/size]
[b]mod_sed:[/b]实现了类似sed功能的module,可以通过正则表达式修改文本内容。apache2.3中加入了这个module,但是这个module也可以用于apache 2.0版本。可以到[url=http://src.opensolaris.org/source/xref/webstack/mod_sed/]http://src.opensolaris.org/source/xref/webstack/mod_sed/[/url]下载源代码,readme里有相应的编译命令：/http安装路径/bin/apxs -i -c mod_sed.c regexp.c sed0.c sed1.c

[b]mod_substitute:[/b]功能和mod_sed类似，默认加入到apache2.2中[url=http://httpd.apache.org/docs/2.2/mod/mod_substitute.html]http://httpd.apache.org/docs/2.2/mod/mod_substitute.html[/url]

[b]mod_line_edit:[/b]也可以基于正则表达式替换文本内容，可以修改html/css/javascript。但是它和前两者不同的是mod_line_edit的to-pattern可以使用apache的环境变量[url=http://apache.webthing.com/mod_line_edit/]http://apache.webthing.com/mod_line_edit/[/url]，这个功能正是我最近需要的。
例如下面的配置可以在<head>标签后插入一个<meta/>标签并且可以将环境变量unique_id的值添加到meta的属性中去（unique_id环境变量需要mod_unique_id的支持[url=http://lamp.linux.gov.cn/apache/apachemenu/mod/mod_unique_id.html]http://lamp.linux.gov.cn/apache/apachemenu/mod/mod_unique_id.html[/url]）

lerewriterule "<head>" "<head><meta http-equiv='request-id' content='${unique_id}' />" iv

[b]mod_proxy_html:[/b]可以基于html标签进行比较精细的内容修改操作[url=http://apache.webthing.com/mod_proxy_html/]http://apache.webthing.com/mod_proxy_html/[/url]

以上这些module都是基于apache的过滤器的功能来完成对响应内容的修改（类似servlet里的filter）[url=http://lamp.linux.gov.cn/apache/apachemenu/filter.html]http://lamp.linux.gov.cn/apache/apachemenu/filter.html[/url]
[img]http://lamp.linux.gov.cn/apache/apachemenu/images/filter_arch.gif[/img]

[size=large]mod_line_edit使用经验:[/size]
[b]1.mod_line_edit的性能问题：[/b]考虑到系统中有500-700k的html页面，因此我就对mod_line_edit在展现1m的静态html做了一下压力测试。测试结果非常不理想，在不添加mod_line_edit时tps在500+，加上mod_line_edit后tps只有1(狂汗...)。
[b]2.mod_line_edit的代码分析：[/b]

  /* mod_line_edit顾名思义就是对文本内容按行进行编辑，   * 因此mod要先对输出流进行整理，每一行内容收集到一个apr_bucket里，   * 然后将所有行数据放到bbline中，下面的代码实现的就是这个用途。   */  bbline = apr_brigade_create(f->r->pool, f->c->bucket_alloc) ;  /* first ensure we have no mid-line breaks that might be in the   * middle of a search string causing us to miss it!  at the same   * time we split into lines to avoid pattern-matching over big   * chunks of memory.   */  while ( b != apr_brigade_sentinel(bb) ) {    if ( !apr_bucket_is_metadata(b) ) {      if ( apr_bucket_read(b, &buf, &bytes, apr_block_read) == apr_success ) {	if ( bytes == 0 ) {	  apr_bucket_remove(b) ;	} else while ( bytes > 0 ) {	  switch (cfg->lineend) {	  case lineend_unix:	    le = memchr(buf, '\n', bytes) ;	    break ;	  case lineend_mac:	    le = memchr(buf, '\r', bytes) ;	    break ;	  case lineend_dos:	    /* edge-case issue: if a \r\n spans buckets it'll get missed.	     * not a problem for present purposes, but would be an issue	     * if we claimed to support pattern matching on the lineends.	     */	    found = 0 ;	    le = memchr(buf+1, '\n', bytes-1) ;	    while ( le && !found ) {	      if ( le[-1] == '\r' ) {	        found = 1 ;	      } else {	        le = memchr(le+1, '\n', bytes-1 - (le+1 - buf)) ;	      }	    }	    if ( !found )	      le = 0 ;	    break;	  case lineend_any:	  case lineend_unset:	    /* edge-case notabug: if a \r\n spans buckets it'll get seen as	     * two line-ends.  it'll insert the \n as a one-byte bucket.	     */	    le_n = memchr(buf, '\n', bytes) ;	    le_r = memchr(buf, '\r', bytes) ;	    if ( le_n != null )	      if ( le_n == le_r + sizeof(char))	        le = le_n ;	      else if ( (le_r < le_n) && (le_r != null) )	        le = le_r ;	      else	        le = le_n ;	    else	      le = le_r ;	    break;	  case lineend_none:	    le = 0 ;	    break;	  case lineend_custom:	    le = memchr(buf, cfg->lechar, bytes) ;	    break;	  }	  if ( le ) {	    /* found a lineend in this bucket. 
*/	    offs = 1 + ((unsigned int)le-(unsigned int)buf) / sizeof(char) ;	    apr_bucket_split(b, offs) ;	    bytes -= offs ;	    buf += offs ;	    b1 = apr_bucket_next(b) ;	    apr_bucket_remove(b);	    /* is there any previous unterminated content ? */	    if ( !apr_brigade_empty(ctx->bbsave) ) {	      /* append this to any content waiting for a lineend */	      apr_brigade_insert_tail(ctx->bbsave, b) ;	      rv = apr_brigade_pflatten(ctx->bbsave, &fbuf, &fbytes, f->r->pool) ;	      /* make b a new bucket of the flattened stuff */	      b = apr_bucket_pool_create(fbuf, fbytes, f->r->pool,			f->r->connection->bucket_alloc) ;	      /* bbsave has been consumed, so clear it */	      apr_brigade_cleanup(ctx->bbsave) ;	    }	    /* b now contains exactly one line */	    apr_brigade_insert_tail(bbline, b);	    b = b1 ;	  } else {	    /* no lineend found.  remember the dangling content */	    apr_bucket_remove(b);	    apr_brigade_insert_tail(ctx->bbsave, b);	    bytes = 0 ;	  }	} /* while bytes > 0 */      } else {	/* bucket read failed - oops !  let's remove it. */	apr_bucket_remove(b);      }    } else if ( apr_bucket_is_eos(b) ) {      /* if there's data to pass, send it in one bucket */      if ( !apr_brigade_empty(ctx->bbsave) ) {        rv = apr_brigade_pflatten(ctx->bbsave, &fbuf, &fbytes, f->r->pool) ;        b1 = apr_bucket_pool_create(fbuf, fbytes, f->r->pool,		f->r->connection->bucket_alloc) ;        apr_brigade_insert_tail(bbline, b1);      }      apr_brigade_cleanup(ctx->bbsave) ;      /* start again rather than segfault if a seriously buggy       * filter in front of us sent a bogus eos       */      f->ctx = null ;      /* move the eos to the new brigade */      apr_bucket_remove(b);      apr_brigade_insert_tail(bbline, b);    } else {      /* chop flush or unknown metadata bucket types */      apr_bucket_delete(b);    }    /* ok, reset pointer to what's left (since we're not in a for-loop) */    b = apr_brigade_first(bb) ;  }

  /* 这里就是循环使用配置的规则处理整理好的每行apr_bucket   */  for (i = 0; i < ctx->rewriterules->nelts; ++i) {    for ( b = apr_brigade_first(bbline) ;	b != apr_brigade_sentinel(bbline) ;	b = apr_bucket_next(b) ) {      if ( !apr_bucket_is_metadata(b)	&& (apr_bucket_read(b, &buf, &bytes, apr_block_read) == apr_success)) {	if ( rules[i].flags & m_regex ) {	  bufp = apr_pstrmemdup(ctx->lpool, buf, bytes) ;	  while ( ! ap_regexec(rules[i].from.r, bufp, nmatch, pmatch, 0) ) {	    match = pmatch[0].rm_so ;	    subs = ap_pregsub(f->r->pool, rules[i].to, bufp, nmatch, pmatch) ;	    apr_bucket_split(b, match) ;	    b1 = apr_bucket_next(b) ;	    apr_bucket_split(b1, pmatch[0].rm_eo - match) ;	    b = apr_bucket_next(b1) ;	    apr_bucket_delete(b1) ;	    b1 = apr_bucket_pool_create(subs, strlen(subs), f->r->pool,		  f->r->connection->bucket_alloc) ;	    apr_bucket_insert_before(b, b1) ;	    bufp += pmatch[0].rm_eo ;	  }	} else {	  bufp = buf ;	  while (subs = apr_strmatch(rules[i].from.s, bufp, bytes),			subs != null) {	    match = ((unsigned int)subs - (unsigned int)bufp) / sizeof(char) ;	    bytes -= match ;	    bufp += match ;	    apr_bucket_split(b, match) ;	    b1 = apr_bucket_next(b) ;	    apr_bucket_split(b1, rules[i].length) ;	    b = apr_bucket_next(b1) ;	    apr_bucket_delete(b1) ;	    bytes -= rules[i].length ;	    bufp += rules[i].length ;	    b1 = apr_bucket_immortal_create(rules[i].to, strlen(rules[i].to),		f->r->connection->bucket_alloc) ;	    apr_bucket_insert_before(b, b1) ;	  }	}      }    }    /* if we used a local pool, clear it now */    if ( (ctx->lpool != f->r->pool) && (rules[i].flags & m_regex) ) {      apr_pool_clear(ctx->lpool) ;    }  }

正因为这个filter会对整个输出流进行遍历、整理、拷贝，然后又将整理好的流按行进行处理，所以在输出大文本时势必会影响性能。
[b]3.mod_line_edit的优化：[/b]我对mod_line_edit的需求比较简单，只是在<head>标记后面追加一些内容，所以没有必要对整个输出流进行遍历。只要处理完输出流中的<head>标签，就可以结束对输出流的处理，直接调用