diff --git a/contrib/fulltextindex/README.fti b/contrib/fulltextindex/README.fti index 10838db6912641d129643c6abff8025551523d1b..5c3d6362a6b49ee88f3127b07fa306dfface9024 100644 --- a/contrib/fulltextindex/README.fti +++ b/contrib/fulltextindex/README.fti @@ -3,11 +3,13 @@ An attempt at some sort of Full Text Indexing for PostgreSQL. The included software is an attempt to add some sort of Full Text Indexing support to PostgreSQL. I mean by this that we can ask questions like: - Give me all rows that have 'still' and 'nash' in the 'artist' field. + Give me all rows that have 'still' and 'nash' in the 'artist' or 'title' + fields. Ofcourse we can write this as: - select * from cds where artist ~* 'stills' and artist ~* 'nash'; + select * from cds where (artist ~* 'stills' or title ~* 'stills') and + (artist ~* 'nash' or title ~* 'nash'); But this does not use any indices, and therefore, if your database gets very large, it will not have very high performance (the above query @@ -15,8 +17,8 @@ requires at least one sequential scan, it probably takes 2 due to the self-join). The approach used by this add-on is to define a trigger on the table and -column you want to do this queries on. On every insert in the table, it -takes the value in the specified column, breaks the text in this column +columns you want to do this queries on. On every insert in the table, it +takes the value in the specified columns, breaks the text in these columns up into pieces, and stores all sub-strings into another table, together with a reference to the row in the original table that contained this sub-string (it uses the oid of that row). @@ -24,8 +26,8 @@ sub-string (it uses the oid of that row). By now creating an index over the 'fti-table', we can search for substrings that occur in the original table. By making a join between the fti-table and the orig-table, we can get the actual rows we want -(this can also be done by using subselects, and maybe there're other -ways too). +(this can also be done by using subselects - but subselects are currently +inefficient in Postgres, and maybe there're other ways too). The trigger code also allows an array called StopWords, that prevents certain words from being indexed. @@ -62,20 +64,22 @@ The create the function that contains the trigger:: And finally define the trigger on the 'cds' table: create trigger cds-fti-trigger after update or insert or delete on cds - for each row execute procedure fti(cds-fti, artist); + for each row execute procedure fti(cds-fti, artist, title); Here, the trigger will be defined on table 'cds', it will create -sub-strings from the field 'artist', and it will place those sub-strings -in the table 'cds-fti'. +sub-strings from the fields 'artist' and 'title', and it will place +those sub-strings in the table 'cds-fti'. Now populate the table 'cds'. This will also populate the table 'cds-fti'. -It's fastest to populate the table *before* you create the indices. +It's fastest to populate the table *before* you create the indices. Use the +supplied 'fti.pl' to assist you with this. Before you start using the system, you should at least have the following indices: - create index cds-fti-idx on cds-fti (string, id); - create index cds-oid-idx on cds (oid); + create index cds-fti-idx on cds-fti (string); -- String matching + create index cds-fti-idx on cds-fti (id); -- For deleting a cds row + create index cds-oid-idx on cds (oid); -- For joining cds to cds-fti To get the most performance out of this, you should have 'cds-fti' clustered on disk, ie. all rows with the same sub-strings should be @@ -109,7 +113,7 @@ clustered : same as above, only clustered : 4.501.321 rows A sequential scan of the artist_fti table (and thus also the clustered table) takes around 6:16 minutes.... -Unfortunately I cannot probide anybody else with this test-date, since I +Unfortunately I cannot provide anybody else with this test-data, since I am not allowed to redistribute the data (it's a database being sold by a couple of wholesale companies). Anyways, it's megabytes, so you probably wouldn't want it in this distribution anyways. diff --git a/contrib/fulltextindex/fti.c b/contrib/fulltextindex/fti.c index a797505e7016dbb56b36d90fc99fe769568bdf82..10e40fd5dd2a3801c5212dbe5822f8e2bd875a17 100644 --- a/contrib/fulltextindex/fti.c +++ b/contrib/fulltextindex/fti.c @@ -6,61 +6,87 @@ #include "commands/trigger.h" /* - * Trigger function takes 2 arguments: - 1. relation in which to store the substrings - 2. field to extract substrings from - - The relation in which to insert *must* have the following layout: - - string varchar(#) - id oid - - Example: - -create function fti() returns opaque as -'/home/boekhold/src/postgresql-6.2/contrib/fti/fti.so' language 'C'; - -create table title_fti (string varchar(25), id oid); -create index title_fti_idx on title_fti (string); - -create trigger title_fti_trigger after update or insert or delete on product -for each row execute procedure fti(title_fti, title); - ^^^^^^^^^ - where to store index in - ^^^^^ - which column to index - -ofcourse don't forget to create an index on title_idx, column string, else -you won't notice much speedup :) - -After populating 'product', try something like: - -select p.* from product p, title_fti f1, title_fti f2 where - f1.string='slippery' and f2.string='wet' and f1.id=f2.id and p.oid=f1.id; -*/ - -/* - march 4 1998 Changed breakup() to return less substrings. Only breakup - in word parts which are in turn shortened from the start - of the word (ie. word, ord, rd) - Did allocation of substring buffer outside of breakup() - oct. 5 1997, fixed a bug in string breakup (where there are more nonalpha - characters between words then 1). - - oct 4-5 1997 implemented the thing, at least the basic functionallity - of it all.... -*/ - -/* IMPROVEMENTS: - - save a plan for deletes - create a function that will make the index *after* we have populated - the main table (probably first delete all contents to be sure there's - nothing in it, then re-populate the fti-table) - - can we do something with operator overloading or a seperate function - that can build the final query automatigally? - */ + * Trigger function accepts variable number of arguments: + * + * 1. relation in which to store the substrings + * 2. fields to extract substrings from + * + * The relation in which to insert *must* have the following layout: + * + * string varchar(#) + * id oid + * + * where # is the largest size of the varchar columns being indexed + * + * Example: + * + * -- Create the SQL function based on the compiled shared object + * create function fti() returns opaque as + * '/usr/local/pgsql/lib/contrib/fti.so' language 'C'; + * + * -- Create the FTI table + * create table product_fti (string varchar(255), id oid); + * + * -- Create an index to assist string matches + * create index product_fti_string_idx on product_fti (string); + * + * -- Create an index to assist trigger'd deletes + * create index product_fti_id_idx on product_fti (id); + * + * -- Create an index on the product oid column to assist joins + * -- between the fti table and the product table + * create index product_oid_idx on product (oid); + * + * -- Create the trigger to perform incremental changes to the full text index. + * create trigger product_fti_trig after update or insert or delete on product + * for each row execute procedure fti(product_fti, title, artist); + * ^^^^^^^^^^^ + * table where full text index is stored + * ^^^^^^^^^^^^^ + * columns to index in the base table + * + * After populating 'product', try something like: + * + * SELECT DISTINCT(p.*) FROM product p, product_fti f1, product_fti f2 WHERE + * f1.string ~ '^slippery' AND f2.string ~ '^wet' AND p.oid=f1.id AND p.oid=f2.id; + * + * To check that your indicies are being used correctly, make sure you + * EXPLAIN SELECT ... your test query above. + * + * CHANGELOG + * --------- + * + * august 3 2001 + * Extended fti function to accept more than one column as a + * parameter and all specified columns are indexed. Changed + * all uses of sprintf to snprintf. Made error messages more + * consistent. + * + * march 4 1998 Changed breakup() to return less substrings. Only breakup + * in word parts which are in turn shortened from the start + * of the word (ie. word, ord, rd) + * Did allocation of substring buffer outside of breakup() + * + * oct. 5 1997, fixed a bug in string breakup (where there are more nonalpha + * characters between words then 1). + * + * oct 4-5 1997 implemented the thing, at least the basic functionallity + * of it all.... + * + * TODO + * ---- + * + * prevent generating duplicate words for an oid in the fti table + * save a plan for deletes + * create a function that will make the index *after* we have populated + * the main table (probably first delete all contents to be sure there's + * nothing in it, then re-populate the fti-table) + * + * can we do something with operator overloading or a seperate function + * that can build the final query automatigally? + */ + +#define MAX_FTI_QUERY_LENGTH 8192 extern Datum fti(PG_FUNCTION_ARGS); static char *breakup(char *, char *); @@ -81,10 +107,10 @@ char *StopWords[] = { /* list of words to skip in indexing */ /* stuff for caching query-plans, stolen from contrib/spi/\*.c */ typedef struct { - char *ident; - int nplans; - void **splan; -} EPlan; + char *ident; + int nplans; + void **splan; +} EPlan; static EPlan *InsertPlans = NULL; static EPlan *DeletePlans = NULL; @@ -99,7 +125,7 @@ PG_FUNCTION_INFO_V1(fti); Datum fti(PG_FUNCTION_ARGS) { - TriggerData *trigdata = (TriggerData *) fcinfo->context; + TriggerData *trigdata; Trigger *trigger; /* to get trigger name */ int nargs; /* # of arguments */ char **args; /* arguments */ @@ -111,7 +137,7 @@ fti(PG_FUNCTION_ARGS) bool isinsert = false; bool isdelete = false; int ret; - char query[8192]; + char query[MAX_FTI_QUERY_LENGTH]; Oid oid; /* @@ -124,11 +150,15 @@ fti(PG_FUNCTION_ARGS) */ if (!CALLED_AS_TRIGGER(fcinfo)) - elog(ERROR, "Full Text Indexing: not fired by trigger manager"); + elog(ERROR, "Full Text Indexing: Not fired by trigger manager"); + + /* It's safe to cast now that we've checked */ + trigdata = (TriggerData *) fcinfo->context; + if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event)) - elog(ERROR, "Full Text Indexing: can't process STATEMENT events"); + elog(ERROR, "Full Text Indexing: Can't process STATEMENT events"); if (TRIGGER_FIRED_BEFORE(trigdata->tg_event)) - elog(ERROR, "Full Text Indexing: must be fired AFTER event"); + elog(ERROR, "Full Text Indexing: Must be fired AFTER event"); if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event)) isinsert = true; @@ -148,11 +178,11 @@ fti(PG_FUNCTION_ARGS) rettuple = trigdata->tg_newtuple; if ((ret = SPI_connect()) < 0) - elog(ERROR, "Full Text Indexing: SPI_connect failed, returned %d\n", ret); + elog(ERROR, "Full Text Indexing: SPI_connect: Failed, returned %d\n", ret); nargs = trigger->tgnargs; - if (nargs != 2) - elog(ERROR, "Full Text Indexing: trigger can only have 2 arguments"); + if (nargs < 2) + elog(ERROR, "Full Text Indexing: Trigger must have at least 2 arguments\n"); args = trigger->tgargs; indexname = args[0]; @@ -161,7 +191,7 @@ fti(PG_FUNCTION_ARGS) /* get oid of current tuple, needed by all, so place here */ oid = rettuple->t_data->t_oid; if (!OidIsValid(oid)) - elog(ERROR, "Full Text Indexing: oid of current tuple is NULL"); + elog(ERROR, "Full Text Indexing: Oid of current tuple is invalid"); if (isdelete) { @@ -169,8 +199,14 @@ fti(PG_FUNCTION_ARGS) Oid *argtypes; Datum values[1]; EPlan *plan; + int i; + + snprintf(query, MAX_FTI_QUERY_LENGTH, "D%s", indexname); + for (i = 1; i < nargs; i++) + { + snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]); + } - sprintf(query, "D%s$%s", args[0], args[1]); plan = find_plan(query, &DeletePlans, &nDeletePlans); if (plan->nplans <= 0) { @@ -178,15 +214,13 @@ fti(PG_FUNCTION_ARGS) argtypes[0] = OIDOID; - sprintf(query, "DELETE FROM %s WHERE id = $1", indexname); + snprintf(query, MAX_FTI_QUERY_LENGTH, "DELETE FROM %s WHERE id = $1", indexname); pplan = SPI_prepare(query, 1, argtypes); if (!pplan) - elog(ERROR, "Full Text Indexing: SPI_prepare returned NULL " - "in delete"); + elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL in delete"); pplan = SPI_saveplan(pplan); if (pplan == NULL) - elog(ERROR, "Full Text Indexing: SPI_saveplan returned NULL " - "in delete"); + elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in delete"); plan->splan = (void **) malloc(sizeof(void *)); *(plan->splan) = pplan; @@ -197,21 +231,29 @@ fti(PG_FUNCTION_ARGS) ret = SPI_execp(*(plan->splan), values, NULL, 0); if (ret != SPI_OK_DELETE) - elog(ERROR, "Full Text Indexing: error executing plan in delete"); + elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in delete"); } if (isinsert) { - char *substring, - *column; - void *pplan; - Oid *argtypes; + char *substring; + char *column; + void *pplan; + Oid *argtypes; Datum values[2]; int colnum; - struct varlena *data; - EPlan *plan; + struct varlena *data; + EPlan *plan; + int i; + char *buff; + char *string; + + snprintf(query, MAX_FTI_QUERY_LENGTH, "I%s", indexname); + for (i = 1; i < nargs; i++) + { + snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]); + } - sprintf(query, "I%s$%s", args[0], args[1]); plan = find_plan(query, &InsertPlans, &nInsertPlans); /* no plan yet, so allocate mem for argtypes */ @@ -224,67 +266,65 @@ fti(PG_FUNCTION_ARGS) argtypes[1] = OIDOID; /* id oid); */ /* prepare plan to gain speed */ - sprintf(query, "INSERT INTO %s (string, id) VALUES ($1, $2)", + snprintf(query, MAX_FTI_QUERY_LENGTH, "INSERT INTO %s (string, id) VALUES ($1, $2)", indexname); pplan = SPI_prepare(query, 2, argtypes); if (!pplan) - elog(ERROR, "Full Text Indexing: SPI_prepare returned NULL " - "in insert"); + elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL in insert"); pplan = SPI_saveplan(pplan); if (pplan == NULL) - elog(ERROR, "Full Text Indexing: SPI_saveplan returned NULL" - " in insert"); + elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in insert"); plan->splan = (void **) malloc(sizeof(void *)); *(plan->splan) = pplan; plan->nplans = 1; } - /* prepare plan for query */ - colnum = SPI_fnumber(tupdesc, args[1]); - if (colnum == SPI_ERROR_NOATTRIBUTE) - elog(ERROR, "Full Text Indexing: column '%s' of '%s' not found", - args[1], args[0]); - - /* Get the char* representation of the column with name args[1] */ - column = SPI_getvalue(rettuple, tupdesc, colnum); - - if (column) - { /* make sure we don't try to index NULL's */ - char *buff; - char *string = column; - - while (*string != '\0') - { - *string = tolower((unsigned char) *string); - string++; - } + for (i = 0; i < nargs - 1; i++) + { + colnum = SPI_fnumber(tupdesc, args[i + 1]); + if (colnum == SPI_ERROR_NOATTRIBUTE) + elog(ERROR, "Full Text Indexing: SPI_fnumber: Column '%s' of '%s' not found", args[i + 1], indexname); - data = (struct varlena *) palloc(sizeof(int32) + strlen(column) +1); - buff = palloc(strlen(column) + 1); - /* saves lots of calls in while-loop and in breakup() */ + /* Get the char* representation of the column */ + column = SPI_getvalue(rettuple, tupdesc, colnum); - new_tuple = true; - while ((substring = breakup(column, buff))) + /* make sure we don't try to index NULL's */ + if (column) { - int l; - - l = strlen(substring); - - data->vl_len = l + sizeof(int32); - memcpy(VARDATA(data), substring, l); - values[0] = PointerGetDatum(data); - values[1] = oid; - - ret = SPI_execp(*(plan->splan), values, NULL, 0); - if (ret != SPI_OK_INSERT) - elog(ERROR, "Full Text Indexing: error executing plan " - "in insert"); + string = column; + while (*string != '\0') + { + *string = tolower((unsigned char) *string); + string++; + } + + data = (struct varlena *) palloc(sizeof(int32) + strlen(column) + 1); + buff = palloc(strlen(column) + 1); + /* saves lots of calls in while-loop and in breakup() */ + + new_tuple = true; + + while ((substring = breakup(column, buff))) + { + int l; + + l = strlen(substring); + + data->vl_len = l + sizeof(int32); + memcpy(VARDATA(data), substring, l); + values[0] = PointerGetDatum(data); + values[1] = oid; + + ret = SPI_execp(*(plan->splan), values, NULL, 0); + if (ret != SPI_OK_INSERT) + elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in insert"); + } + pfree(buff); + pfree(data); } - pfree(buff); - pfree(data); } } diff --git a/contrib/fulltextindex/fti.pl b/contrib/fulltextindex/fti.pl index 230ba92703352a41ce9f8fb29a4d569d5e82abdb..58080a5425cbfae55473a1ef9422b964ee741b5b 100644 --- a/contrib/fulltextindex/fti.pl +++ b/contrib/fulltextindex/fti.pl @@ -17,7 +17,7 @@ # # Example: # -# fti.pl -u -d mydb -t mytable -c mycolumn -f myfile +# fti.pl -u -d mydb -t mytable -c mycolumn,mycolumn2 -f myfile # sort -o myoutfile myfile # uniq myoutfile sorted-file # @@ -140,11 +140,13 @@ sub main { getopts('d:t:c:f:u'); if (!$opt_d || !$opt_t || !$opt_c || !$opt_f) { - print STDERR "usage: $0 [-u] -d database -t table -c column ". + print STDERR "usage: $0 [-u] -d database -t table -c column[,column...] ". "-f output-file\n"; return 1; } + @cols = split(/,/, $opt_c); + if (defined($opt_u)) { $uname = get_username(); $pwd = get_password(); @@ -166,7 +168,9 @@ sub main { PQexec($PG_CONN, "begin"); - $query = "declare C cursor for select $opt_c, oid from $opt_t"; + $query = "declare C cursor for select (\""; + $query .= join("\" || ' ' || \"", @cols); + $query .= "\") as string, oid from $opt_t"; $res = PQexec($PG_CONN, $query); if (!$res || (PQresultStatus($res) != $PGRES_COMMAND_OK)) { print STDERR "Error declaring cursor!\n";