| PostgreSQL 8.3beta1 Documentation | ||||
|---|---|---|---|---|
| Prev | Fast Backward | Chapter 12. Full Text Search | Fast Forward | Next |
SQL command CREATE TEXT SEARCH PARSER creates
a parser for full text searching. In our example we will implement
a simple parser which recognizes space-delimited words and
has only two types (3, word, Word; 12, blank, Space symbols). Identifiers
were chosen to keep compatibility with the default headline() function
since we do not implement our own version.
To implement a parser one needs to create a minimum of four functions.
START = start_function
Initialize the parser. Arguments are a pointer to the parsed text and its length.
Returns a pointer to the internal structure of a parser. Note that it should
be malloced or palloced in the
TopMemoryContext. We name it ParserState.
GETTOKEN = gettoken_function
Returns the next token. Arguments are ParserState *, char **, int *.
This procedure will be called as long as the procedure returns token type zero.
END = end_function,
This void function will be called after parsing is finished to free allocated resources in this procedure (ParserState). The argument is ParserState *.
LEXTYPES = lextypes_function
Returns an array containing the id, alias, and the description of the tokens in the parser. See LexDescr in src/include/utils/ts_public.h.
Below is the source code of our test parser, organized as a contrib module.
Testing:
SELECT * FROM ts_parse('testparser','That''s my first own parser');
tokid | token
-------+--------
3 | That's
12 |
3 | my
12 |
3 | first
12 |
3 | own
12 |
3 | parser
SELECT to_tsvector('testcfg','That''s my first own parser');
to_tsvector
-------------------------------------------------
'my':2 'own':4 'first':3 'parser':5 'that''s':1
SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', to_tsquery('testcfg', 'star'));
headline
-----------------------------------------------------------------
Supernovae <b>stars</b> are the brightest phenomena in galaxies
This test parser is an example adopted from a tutorial by Valli, parser HOWTO.
To compile the example just do:
$ make $ make install $ psql regression < test_parser.sql
This is a test_parser.c:
#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif
/*
* types
*/
/* self-defined type */
typedef struct {
char * buffer; /* text to parse */
int len; /* length of the text in buffer */
int pos; /* position of the parser */
} ParserState;
/* copy-paste from wparser.h of tsearch2 */
typedef struct {
int lexid;
char *alias;
char *descr;
} LexDescr;
/*
* prototypes
*/
PG_FUNCTION_INFO_V1(testprs_start);
Datum testprs_start(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(testprs_getlexeme);
Datum testprs_getlexeme(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(testprs_end);
Datum testprs_end(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(testprs_lextype);
Datum testprs_lextype(PG_FUNCTION_ARGS);
/*
* functions
*/
Datum testprs_start(PG_FUNCTION_ARGS)
{
ParserState *pst = (ParserState *) palloc(sizeof(ParserState));
pst->buffer = (char *) PG_GETARG_POINTER(0);
pst->len = PG_GETARG_INT32(1);
pst->pos = 0;
PG_RETURN_POINTER(pst);
}
Datum testprs_getlexeme(PG_FUNCTION_ARGS)
{
ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
char **t = (char **) PG_GETARG_POINTER(1);
int *tlen = (int *) PG_GETARG_POINTER(2);
int type;
*tlen = pst->pos;
*t = pst->buffer + pst->pos;
if ((pst->buffer)[pst->pos] == ' ')
{
/* blank type */
type = 12;
/* go to the next non-white-space character */
while ((pst->buffer)[pst->pos] == ' ' &&
pst->pos < pst->len)
(pst->pos)++;
} else {
/* word type */
type = 3;
/* go to the next white-space character */
while ((pst->buffer)[pst->pos] != ' ' &&
pst->pos < pst->len)
(pst->pos)++;
}
*tlen = pst->pos - *tlen;
/* we are finished if (*tlen == 0) */
if (*tlen == 0)
type=0;
PG_RETURN_INT32(type);
}
Datum testprs_end(PG_FUNCTION_ARGS)
{
ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
pfree(pst);
PG_RETURN_VOID();
}
Datum testprs_lextype(PG_FUNCTION_ARGS)
{
/*
Remarks:
- we have to return the blanks for headline reason
- we use the same lexids like Teodor in the default
word parser; in this way we can reuse the headline
function of the default word parser.
*/
LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (2+1));
/* there are only two types in this parser */
descr[0].lexid = 3;
descr[0].alias = pstrdup("word");
descr[0].descr = pstrdup("Word");
descr[1].lexid = 12;
descr[1].alias = pstrdup("blank");
descr[1].descr = pstrdup("Space symbols");
descr[2].lexid = 0;
PG_RETURN_POINTER(descr);
}
This is a Makefile
override CPPFLAGS := -I. $(CPPFLAGS) MODULE_big = test_parser OBJS = test_parser.o DATA_built = test_parser.sql DATA = DOCS = README.test_parser REGRESS = test_parser ifdef USE_PGXS PGXS := $(shell pg_config --pgxs) include $(PGXS) else subdir = contrib/test_parser top_builddir = ../.. include $(top_builddir)/src/Makefile.global include $(top_srcdir)/contrib/contrib-global.mk endif
This is a test_parser.sql.in:
SET default_text_search_config = 'english';
BEGIN;
CREATE FUNCTION testprs_start(internal,int4)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE 'C' with (isstrict);
CREATE FUNCTION testprs_getlexeme(internal,internal,internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE 'C' with (isstrict);
CREATE FUNCTION testprs_end(internal)
RETURNS void
AS 'MODULE_PATHNAME'
LANGUAGE 'C' with (isstrict);
CREATE FUNCTION testprs_lextype(internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE 'C' with (isstrict);
CREATE TEXT SEARCH PARSER testparser (
START = testprs_start,
GETTOKEN = testprs_getlexeme,
END = testprs_end,
LEXTYPES = testprs_lextype
);
CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser);
ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
END;