1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-27 12:41:57 +03:00

Attached is an update to contrib/tablefunc. It implements a new hashed

version of crosstab. This fixes a major deficiency in real-world use of
the original version. Easiest to undestand with an illustration:

Data:
-------------------------------------------------------------------
select * from cth;
  id | rowid |        rowdt        |   attribute    |      val
----+-------+---------------------+----------------+---------------
   1 | test1 | 2003-03-01 00:00:00 | temperature    | 42
   2 | test1 | 2003-03-01 00:00:00 | test_result    | PASS
   3 | test1 | 2003-03-01 00:00:00 | volts          | 2.6987
   4 | test2 | 2003-03-02 00:00:00 | temperature    | 53
   5 | test2 | 2003-03-02 00:00:00 | test_result    | FAIL
   6 | test2 | 2003-03-02 00:00:00 | test_startdate | 01 March 2003
   7 | test2 | 2003-03-02 00:00:00 | volts          | 3.1234
(7 rows)

Original crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
   'SELECT rowid, attribute, val FROM cth ORDER BY 1,2',4)
AS c(rowid text, temperature text, test_result text, test_startdate
text, volts text);
  rowid | temperature | test_result | test_startdate | volts
-------+-------------+-------------+----------------+--------
  test1 | 42          | PASS        | 2.6987         |
  test2 | 53          | FAIL        | 01 March 2003  | 3.1234
(2 rows)

Hashed crosstab:
-------------------------------------------------------------------
SELECT * FROM crosstab(
   'SELECT rowid, attribute, val FROM cth ORDER BY 1',
   'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, temperature int4, test_result text, test_startdate
timestamp, volts float8);
  rowid | temperature | test_result |   test_startdate    | volts
-------+-------------+-------------+---------------------+--------
  test1 |          42 | PASS        |                     | 2.6987
  test2 |          53 | FAIL        | 2003-03-01 00:00:00 | 3.1234
(2 rows)

Notice that the original crosstab slides data over to the left in the
result tuple when it encounters missing data. In order to work around
this you have to be make your source sql do all sorts of contortions
(cartesian join of distinct rowid with distinct attribute; left join
that back to the real source data). The new version avoids this by
building a hash table using a second distinct attribute query.

The new version also allows for "extra" columns (see the README) and
allows the result columns to be coerced into differing datatypes if they
are suitable (as shown above).

In testing a "real-world" data set (69 distinct rowid's, 27 distinct
categories/attributes, multiple missing data points) I saw about a
5-fold improvement in execution time (from about 2200 ms old, to 440 ms
new).

I left the original version intact because: 1) BC, 2) it is probably
slightly faster if you know that you have no missing attributes.

README and regression test adjustments included. If there are no
objections, please apply.

Joe Conway
This commit is contained in:
Bruce Momjian
2003-03-20 06:46:30 +00:00
parent add932ee91
commit 64d0b8b05f
6 changed files with 655 additions and 1 deletions

View File

@ -38,6 +38,61 @@ SELECT * FROM crosstab('SELECT rowid, attribute, val FROM ct where rowclass = ''
SELECT * FROM crosstab('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;', 3) AS c(rowid text, att1 text, att2 text, att3 text);
SELECT * FROM crosstab('SELECT rowid, attribute, val FROM ct where rowclass = ''group1'' ORDER BY 1,2;', 4) AS c(rowid text, att1 text, att2 text, att3 text, att4 text);
--
-- hash based crosstab
--
create table cth(id serial, rowid text, rowdt timestamp, attribute text, val text);
insert into cth values(DEFAULT,'test1','01 March 2003','temperature','42');
insert into cth values(DEFAULT,'test1','01 March 2003','test_result','PASS');
-- the next line is intentionally left commented and is therefore a "missing" attribute
-- insert into cth values(DEFAULT,'test1','01 March 2003','test_startdate','28 February 2003');
insert into cth values(DEFAULT,'test1','01 March 2003','volts','2.6987');
insert into cth values(DEFAULT,'test2','02 March 2003','temperature','53');
insert into cth values(DEFAULT,'test2','02 March 2003','test_result','FAIL');
insert into cth values(DEFAULT,'test2','02 March 2003','test_startdate','01 March 2003');
insert into cth values(DEFAULT,'test2','02 March 2003','volts','3.1234');
-- return attributes as plain text
SELECT * FROM crosstab(
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, rowdt timestamp, temperature text, test_result text, test_startdate text, volts text);
-- this time without rowdt
SELECT * FROM crosstab(
'SELECT rowid, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, temperature text, test_result text, test_startdate text, volts text);
-- convert attributes to specific datatypes
SELECT * FROM crosstab(
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth ORDER BY 1')
AS c(rowid text, rowdt timestamp, temperature int4, test_result text, test_startdate timestamp, volts float8);
-- source query and category query out of sync
SELECT * FROM crosstab(
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth WHERE attribute IN (''temperature'',''test_result'',''test_startdate'') ORDER BY 1')
AS c(rowid text, rowdt timestamp, temperature int4, test_result text, test_startdate timestamp);
-- if category query generates no rows, get expected error
SELECT * FROM crosstab(
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT attribute FROM cth WHERE attribute = ''a'' ORDER BY 1')
AS c(rowid text, rowdt timestamp, temperature int4, test_result text, test_startdate timestamp, volts float8);
-- if category query generates more than one column, get expected error
SELECT * FROM crosstab(
'SELECT rowid, rowdt, attribute, val FROM cth ORDER BY 1',
'SELECT DISTINCT rowdt, attribute FROM cth ORDER BY 2')
AS c(rowid text, rowdt timestamp, temperature int4, test_result text, test_startdate timestamp, volts float8);
--
-- connectby
--
-- test connectby with text based hierarchy
CREATE TABLE connectby_text(keyid text, parent_keyid text);
\copy connectby_text from 'data/connectby_text.data'