atom feed88 messages in org.postgresql.pgsql-hackers[HACKERS] wip: functions median and p...
FromSent OnAttachments
Pavel StehuleAug 19, 2010 3:59 am.diff
Greg StarkAug 19, 2010 7:50 am 
Tom LaneAug 19, 2010 8:32 am 
Robert HaasAug 19, 2010 9:49 am 
David FetterAug 19, 2010 10:02 am 
Kevin GrittnerAug 19, 2010 10:11 am 
David FetterAug 19, 2010 10:14 am 
Tom LaneAug 19, 2010 10:25 am 
Pavel StehuleAug 19, 2010 10:37 am 
Pavel StehuleAug 19, 2010 10:41 am 
David FetterAug 19, 2010 10:47 am 
Kevin GrittnerAug 19, 2010 11:17 am 
Tom LaneAug 19, 2010 11:17 am 
Hitoshi HaradaSep 19, 2010 8:46 pm 
Pavel StehuleSep 21, 2010 1:27 pm.diff
Robert HaasSep 21, 2010 1:43 pm 
Pavel StehuleSep 21, 2010 2:03 pm.diff
Hitoshi HaradaSep 21, 2010 7:20 pm 
Hitoshi HaradaSep 21, 2010 8:06 pm 
Pavel StehuleSep 21, 2010 9:36 pm.diff
Pavel StehuleSep 21, 2010 9:44 pm 
Pavel StehuleSep 23, 2010 2:35 am 
Pavel StehuleSep 23, 2010 6:22 am.diff
Pavel StehuleSep 23, 2010 6:25 am.diff
Hitoshi HaradaSep 23, 2010 10:45 am 
Pavel StehuleSep 23, 2010 11:27 am 
62 later messages
Subject:[HACKERS] wip: functions median and percentile
From:Pavel Stehule (pave@gmail.com)
Date:Aug 19, 2010 3:59:10 am
List:org.postgresql.pgsql-hackers
Attachments:

Hello

I am sending a prototype implementation of functions median and percentile. This implementation is very simple and I moved it to contrib for this moment - it is more easy maintainable. Later I'll move it to core.

These functions are relative simple, there are not barrier for implementation own specific mutations of this functions - so I propose move to core only basic and well known form of these to core.

postgres=# select median(v) from generate_series(1,10) g(v); median ──────── 5.5 (1 row)

Time: 1.475 ms postgres=# select percentile(v,50) from generate_series(1,10) g(v); percentile ──────────── 5 (1 row)

Time: 0.626 ms

This implementation is based on tuplesort and the speed is relative well - the result from 1000000 rows is less 1 sec.

Regards

*** ./contrib/median/Makefile.orig 2010-08-19 12:38:56.144777253 +0200 --- ./contrib/median/Makefile 2010-08-18 20:23:39.180156339 +0200 *************** *** 0 **** --- 1,17 ---- + # $PostgreSQL: pgsql/contrib/median/Makefile,v 1.1 2008/07/29 18:31:20 tgl Exp
$ + + MODULES = median + DATA_built = median.sql + DATA = uninstall_median.sql + REGRESS = median + + ifdef USE_PGXS + PG_CONFIG = pg_config + PGXS := $(shell $(PG_CONFIG) --pgxs) + include $(PGXS) + else + subdir = contrib/median + top_builddir = ../.. + include $(top_builddir)/src/Makefile.global + include $(top_srcdir)/contrib/contrib-global.mk + endif *** ./contrib/median/median.c.orig 2010-08-19 12:39:01.456650776 +0200 --- ./contrib/median/median.c 2010-08-19 12:35:32.104649418 +0200 *************** *** 0 **** --- 1,244 ---- + /* + * $PostgreSQL: pgsql/contrib/citext/citext.c,v 1.2 2009/06/11 14:48:50
momjian Exp $ + */ + #include "postgres.h" + + #include "funcapi.h" + #include "miscadmin.h" + #include "catalog/pg_type.h" + #include "parser/parse_coerce.h" + #include "parser/parse_oper.h" + #include "utils/builtins.h" + #include "utils/tuplesort.h" + + Datum median_transfn(PG_FUNCTION_ARGS); + Datum median_finalfn(PG_FUNCTION_ARGS); + Datum percentile_transfn(PG_FUNCTION_ARGS); + Datum percentile_finalfn(PG_FUNCTION_ARGS); + + + #ifdef PG_MODULE_MAGIC + PG_MODULE_MAGIC; + #endif + + PG_FUNCTION_INFO_V1(median_transfn); + PG_FUNCTION_INFO_V1(median_finalfn); + PG_FUNCTION_INFO_V1(percentile_transfn); + PG_FUNCTION_INFO_V1(percentile_finalfn); + + + typedef struct + { + int nelems; /* number of valid entries */ + Tuplesortstate *sortstate; + FmgrInfo cast_func_finfo; + int p; /* nth for percentille */ + } StatAggState; + + static StatAggState * + makeStatAggState(FunctionCallInfo fcinfo) + { + MemoryContext oldctx; + MemoryContext aggcontext; + StatAggState *aggstate; + Oid sortop, + castfunc; + Oid valtype; + CoercionPathType pathtype; + + if (!AggCheckCallContext(fcinfo, &aggcontext)) + { + /* cannot be called directly because of internal-type argument */ + elog(ERROR, "string_agg_transfn called in non-aggregate context"); + } + + oldctx = MemoryContextSwitchTo(aggcontext); + + aggstate = (StatAggState *) palloc(sizeof(StatAggState)); + aggstate->nelems = 0; + + valtype = get_fn_expr_argtype(fcinfo->flinfo, 1); + get_sort_group_operators(valtype, + true, false, false, + &sortop, NULL, NULL); + + aggstate->sortstate = tuplesort_begin_datum(valtype, + sortop, + SORTBY_NULLS_DEFAULT, + work_mem, false); + + MemoryContextSwitchTo(oldctx); + + if (valtype != FLOAT8OID) + { + /* find a cast function */ + + pathtype = find_coercion_pathway(FLOAT8OID, valtype, + COERCION_EXPLICIT, + &castfunc); + if (pathtype == COERCION_PATH_FUNC) + { + Assert(OidIsValid(castfunc)); + fmgr_info_cxt(castfunc, &aggstate->cast_func_finfo, + aggcontext); + } + else if (pathtype == COERCION_PATH_RELABELTYPE) + { + aggstate->cast_func_finfo.fn_oid = InvalidOid; + } + else + elog(ERROR, "no conversion function from %s %s", + format_type_be(valtype), + format_type_be(FLOAT8OID)); + } + + return aggstate; + } + + /* + * append a non NULL value to tuplesort + */ + Datum + median_transfn(PG_FUNCTION_ARGS) + { + StatAggState *aggstate; + + aggstate = PG_ARGISNULL(0) ? NULL : (StatAggState *) PG_GETARG_POINTER(0); + + if (!PG_ARGISNULL(1)) + { + if (aggstate == NULL) + aggstate = makeStatAggState(fcinfo); + + tuplesort_putdatum(aggstate->sortstate, PG_GETARG_DATUM(1), false); + aggstate->nelems++; + } + + PG_RETURN_POINTER(aggstate); + } + + static double + to_double(Datum value, FmgrInfo *cast_func_finfo) + { + if (cast_func_finfo->fn_oid != InvalidOid) + { + return DatumGetFloat8(FunctionCall1(cast_func_finfo, value)); + } + else + return DatumGetFloat8(value); + } + + Datum + median_finalfn(PG_FUNCTION_ARGS) + { + StatAggState *aggstate; + + Assert(AggCheckCallContext(fcinfo, NULL)); + + aggstate = PG_ARGISNULL(0) ? NULL : (StatAggState *) PG_GETARG_POINTER(0); + + if (aggstate != NULL) + { + int lidx; + int hidx; + Datum value; + bool isNull; + int i = 1; + double result = 0; + + hidx = aggstate->nelems / 2 + 1; + lidx = (aggstate->nelems + 1) / 2; + + tuplesort_performsort(aggstate->sortstate); + + while (tuplesort_getdatum(aggstate->sortstate, + true, + &value, &isNull)) + { + if (i++ == lidx) + { + result = to_double(value, &aggstate->cast_func_finfo); + + if (lidx != hidx) + { + tuplesort_getdatum(aggstate->sortstate, + true, + &value, &isNull); + result = (result + to_double(value, &aggstate->cast_func_finfo)) / 2.0; + } + break; + } + } + + tuplesort_end(aggstate->sortstate); + + PG_RETURN_FLOAT8(result); + } + else + PG_RETURN_NULL(); + } + + /* + * append a non NULL value to tuplesort + */ + Datum + percentile_transfn(PG_FUNCTION_ARGS) + { + StatAggState *aggstate; + + aggstate = PG_ARGISNULL(0) ? NULL : (StatAggState *) PG_GETARG_POINTER(0); + + if (!PG_ARGISNULL(1)) + { + if (aggstate == NULL) + { + aggstate = makeStatAggState(fcinfo); + aggstate->p = PG_GETARG_INT32(2); + } + + tuplesort_putdatum(aggstate->sortstate, PG_GETARG_DATUM(1), false); + aggstate->nelems++; + } + + PG_RETURN_POINTER(aggstate); + } + + Datum + percentile_finalfn(PG_FUNCTION_ARGS) + { + StatAggState *aggstate; + + Assert(AggCheckCallContext(fcinfo, NULL)); + + aggstate = PG_ARGISNULL(0) ? NULL : (StatAggState *) PG_GETARG_POINTER(0); + + if (aggstate != NULL) + { + Datum value; + bool isNull; + int i = 1; + double result = 0; + int n; + + n = ((aggstate->p / 100.0) * (aggstate->nelems - 1)) + 1; + + tuplesort_performsort(aggstate->sortstate); + + while (tuplesort_getdatum(aggstate->sortstate, + true, + &value, &isNull)) + { + if (i++ == n) + { + result = to_double(value, &aggstate->cast_func_finfo); + break; + } + } + + tuplesort_end(aggstate->sortstate); + + PG_RETURN_FLOAT8(result); + } + else + PG_RETURN_NULL(); + } *** ./contrib/median/median.sql.in.orig 2010-08-19 12:39:06.192775857 +0200 --- ./contrib/median/median.sql.in 2010-08-19 12:28:24.230774219 +0200 *************** *** 0 **** --- 1,31 ---- + CREATE OR REPLACE FUNCTION median_transfn(internal, anyelement) + RETURNS internal + AS 'MODULE_PATHNAME' + LANGUAGE C IMMUTABLE; + + CREATE OR REPLACE FUNCTION median_finalfn(internal) + RETURNS double precision + AS 'MODULE_PATHNAME' + LANGUAGE C IMMUTABLE; + + CREATE AGGREGATE median(anyelement) ( + SFUNC=median_transfn, + STYPE=internal, + FINALFUNC=median_finalfn + ); + + CREATE OR REPLACE FUNCTION percentile_transfn(internal, anyelement, p integer) + RETURNS internal + AS 'MODULE_PATHNAME' + LANGUAGE C IMMUTABLE; + + CREATE OR REPLACE FUNCTION percentile_finalfn(internal) + RETURNS double precision + AS 'MODULE_PATHNAME' + LANGUAGE C IMMUTABLE; + + CREATE AGGREGATE percentile(anyelement, int) ( + SFUNC=percentile_transfn, + STYPE=internal, + FINALFUNC=percentile_finalfn + ); *** ./contrib/median/uninstall_median.sql.orig 2010-08-19 12:39:11.712777158
+0200 --- ./contrib/median/uninstall_median.sql 2010-08-19 12:37:25.800652539 +0200 *************** *** 0 **** --- 1,6 ---- + DROP FUNCTION median_transfn(internal, anyelement); + DROP FUNCTION median_finalfn(internal); + DROP AGGREGATE median(anyelement); + DROP FUNCTION percentile_transfn(internal, anyelement, p integer); + DROP FUNCTION percentile_finalfn(internal); + DROP AGGREGATE percentile(anyelement, int);