如何为Postgresql数据库全文搜索(full text search)编写解析器(parser)

本文介绍如何为PostgreSQL全文搜索创建一个处理CamelCase字符串的解析器。该解析器适用于PostgreSQL 8.3.9版本,并在Ubuntu 8.10上进行了测试。文章详细展示了如何从头开始构建解析器,包括编写C代码、创建Makefile以及安装配置步骤。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

用英文写的,主要是命令和代码,就不翻译了,偷懒一下。

This article show you how to create a parser to handle camel case [3] in string in Postgresql full text search. This parser is tested on Postgresql 8.3.9 version. The OS is ubuntu 8.10.

[1,2] will give you basic background and sample for old version. I update the code to fit 8.3.9 version, especial lots of changes in the SQL statement.

Create parser

First, create a directory for put all files, for example: /home/user/Desktop/test_parser

Create file "test_parser.c". I use vim.

#include "postgres.h"

#include "utils/builtins.h"





#ifdef PG_MODULE_MAGIC

PG_MODULE_MAGIC;

#endif



/*

* types

*/



/* self-defined type */

typedef struct {

char * buffer; /* text to parse */

int len; /* length of the text in buffer */

int pos; /* position of the parser */

} ParserState;



/* copy-paste from wparser.h of tsearch2 */

typedef struct {

int lexid;

char *alias;

char *descr;

} LexDescr;



/*

* prototypes

*/

PG_FUNCTION_INFO_V1(testprs_start);

Datum testprs_start(PG_FUNCTION_ARGS);



PG_FUNCTION_INFO_V1(testprs_getlexeme);

Datum testprs_getlexeme(PG_FUNCTION_ARGS);



PG_FUNCTION_INFO_V1(testprs_end);

Datum testprs_end(PG_FUNCTION_ARGS);



PG_FUNCTION_INFO_V1(testprs_lextype);

Datum testprs_lextype(PG_FUNCTION_ARGS);



/*

* functions

*/

Datum testprs_start
(PG_FUNCTION_ARGS)

{

ParserState *pst = (ParserState *) palloc(sizeof(ParserState));

pst->buffer = (char *) PG_GETARG_POINTER(0);

pst->len = PG_GETARG_INT32(1);

pst->pos = 0;

PG_RETURN_POINTER(pst);

}



Datum testprs_getlexeme
(PG_FUNCTION_ARGS)

{

ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);

char **t = (char **) PG_GETARG_POINTER(1);

int *tlen = (int *) PG_GETARG_POINTER(2);

int type;



*tlen = pst->pos;

*t = pst->buffer + pst->pos;



HTML clipboard/* main process here */
if (((pst->buffer)[pst->pos] >= 'A' && (pst->buffer)[pst->pos] <= 'Z') && (pst->pos < pst->len)) {
/* word type */
type = 3;
(pst->pos)++;
/* for case like: ItemURL => Item URL*/
if(((pst->buffer)[pst->pos] >= 'A' && (pst->buffer)[pst->pos] <= 'Z') && (pst->pos < pst->len)){
while (((pst->buffer)[pst->pos] >= 'A' && (pst->buffer)[pst->pos] <= 'Z') && (pst->pos < pst->len)) {
(pst->pos)++;
}
}

/* go to the next upper case character */
while (((pst->buffer)[pst->pos] >= 'a' && (pst->buffer)[pst->pos] <= 'z') && (pst->pos < pst->len)) {
(pst->pos)++;
}
}

if (((pst->buffer)[pst->pos] >= 'a' && (pst->buffer)[pst->pos] <= 'z') && (pst->pos < pst->len)) {
/* word type */
type = 3;
(pst->pos)++;
/* go to the next upper case character */
while (((pst->buffer)[pst->pos] >= 'a' && (pst->buffer)[pst->pos] <= 'z') && (pst->pos < pst->len)) {
(pst->pos)++;
}
}



*tlen = pst->pos - *tlen;



/* we are finished if (*tlen == 0) */

if (*tlen == 0) type=0;

PG_RETURN_INT32(type);

}



Datum testprs_end
(PG_FUNCTION_ARGS)

{

ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);

pfree(pst);

PG_RETURN_VOID();

}



Datum testprs_lextype
(PG_FUNCTION_ARGS)

{

/*

Remarks:

- we have to return the blanks for headline reason

- we use the same lexids like Teodor in the default

word parser; in this way we can reuse the headline

function of the default word parser.

*/

LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (2+1));



/* there are only two types in this parser */

descr[0].lexid = 3;

descr[0].alias = pstrdup("word");

descr[0].descr = pstrdup("Word");

descr[1].lexid = 12;

descr[1].alias = pstrdup("blank");

descr[1].descr = pstrdup("Space symbols");

descr[2].lexid = 0;

PG_RETURN_POINTER(descr);

}

Create "Makefile" file under the same directory.

override CPPFLAGS := -I. $(CPPFLAGS)
MODULE_big = test_parser
OBJS = test_parser.o

DATA_built = test_parser.sql
DATA =
REGRESS = test_parser

ifdef USE_PGXS
PGXS := $(shell pg_config --pgxs)
include $(PGXS)
else
subdir = contrib/test_parser
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif

Create "test_parser.sql" file under the same directory.

SET search_path = public;
BEGIN;

--DROP TEXT SEARCH CONFIGURATION testcfg;
--DROP TEXT SEARCH PARSER testparser;

CREATE OR REPLACE FUNCTION testprs_start(internal,int4)
RETURNS internal
AS '$libdir/test_parser'
LANGUAGE 'C';

CREATE OR REPLACE FUNCTION testprs_getlexeme(internal,internal,internal)
RETURNS internal
AS '$libdir/test_parser'
LANGUAGE 'C';

CREATE OR REPLACE FUNCTION testprs_end(internal)
RETURNS void
AS '$libdir/test_parser'
LANGUAGE 'C';

CREATE OR REPLACE FUNCTION testprs_lextype(internal)
RETURNS internal
AS '$libdir/test_parser'
LANGUAGE 'C';

CREATE TEXT SEARCH PARSER testparser(
START = 'testprs_start',
GETTOKEN = 'testprs_getlexeme',
END = 'testprs_end',
LEXTYPES = 'testprs_lextype'
);

CREATE TEXT SEARCH CONFIGURATION testcfg (
PARSER ='testparser'
);

ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH english_stem;


END;

Note: the bold part is total changed for the 8.3.9 version. "DROP TEXT SEARCH " is very useful if you make modification and load the script again.

Install the parser:

  • open a terminal, switch to the directory you just create. For example: cd /home/user/Desktop/test_parser
  • type: sudo PATH=/usr/local/pgsql/bin:$PATH USE_PGXS=1 make install
  • switch to user "prostgres", type: /usr/local/pgsql/bin/psql -f /usr/local/pgsql/share/contrib/test_parser.sql your-database
    SET
    BEGIN
    CREATE FUNCTION
    CREATE FUNCTION
    CREATE FUNCTION
    CREATE FUNCTION
    DROP TEXT SEARCH CONFIGURATION
    DROP TEXT SEARCH PARSER
    CREATE TEXT SEARCH PARSER
    CREATE TEXT SEARCH CONFIGURATION
    ALTER TEXT SEARCH CONFIGURATION
    COMMIT
  • type: /usr/local/pgsql/bin/psql postgres
  • test the parser:
    postgres=# SELECT to_tsvector('testcfg','itemValue');
    to_tsvector
    --------------------
    'item':1 'value':2
    (1 row)

    postgres=# SELECT to_tsvector('testcfg','ItemValue');
    to_tsvector
    --------------------
    'item':1 'value':2
    (1 row)

    postgres=# SELECT to_tsquery('testcfg','itemsValue');
    to_tsquery
    -------------------
    'items' & 'value'
    (1 row)

Reference:
[1] http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/docs/HOWTO-parser-tsearch2.html
[2] http://www.pgcon.org/2007/schedule/attachments/12-fts.pdf
[3] http://en.wikipedia.org/wiki/CamelCase
[4] http://www.sai.msu.su/~megera/postgres/fts/doc/sql-fts-createmap.html
[5] http://www.postgresql.org/docs/8.3/static/textsearch-configuration.html
[6] http://developer.postgresql.org/pgdocs/postgres/sql-createtsconfig.html
[7] http://www.postgresql.org/docs/7.4/interactive/sql-createfunction.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值