mirror of
https://github.com/MariaDB/server.git
synced 2025-07-29 05:21:33 +03:00
MDEV-9736: Window functions: multiple cursors to read filesort result
Add support for having multiple IO_CACHEs with type=READ_CACHE to share the file they are reading from. Each IO_CACHE keeps its own in-memory buffer. When doing a read or seek operation on the file, it notifies other IO_CACHEs that the file position has been changed. Make Rowid_seq_cursor use cloned IO_CACHE when reading filesort result.
This commit is contained in:
committed by
Vicențiu Ciorbaru
parent
6e4015727a
commit
047963922c
@ -472,6 +472,8 @@ typedef struct st_io_cache /* Used when cacheing files */
|
||||
const char *dir;
|
||||
char prefix[3];
|
||||
File file; /* file descriptor */
|
||||
|
||||
struct st_io_cache *next_file_user;
|
||||
/*
|
||||
seek_not_done is set by my_b_seek() to inform the upcoming read/write
|
||||
operation that a seek needs to be preformed prior to the actual I/O
|
||||
@ -802,6 +804,11 @@ extern my_bool reinit_io_cache(IO_CACHE *info,enum cache_type type,
|
||||
extern void setup_io_cache(IO_CACHE* info);
|
||||
extern void init_io_cache_share(IO_CACHE *read_cache, IO_CACHE_SHARE *cshare,
|
||||
IO_CACHE *write_cache, uint num_threads);
|
||||
|
||||
extern int init_slave_io_cache(IO_CACHE *master, IO_CACHE *slave);
|
||||
void end_slave_io_cache(IO_CACHE *cache);
|
||||
void seek_io_cache(IO_CACHE *cache, my_off_t needed_offset);
|
||||
|
||||
extern void remove_io_thread(IO_CACHE *info);
|
||||
extern int _my_b_async_read(IO_CACHE *info,uchar *Buffer,size_t Count);
|
||||
extern int my_b_append(IO_CACHE *info,const uchar *Buffer,size_t Count);
|
||||
|
93
mysql-test/r/win_big.result
Normal file
93
mysql-test/r/win_big.result
Normal file
@ -0,0 +1,93 @@
|
||||
create table t0 (a int);
|
||||
insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
|
||||
create table t1(a int);
|
||||
insert into t1 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
|
||||
create table t10 (a int, b int, c int);
|
||||
insert into t10
|
||||
select
|
||||
A.a + 1000*B.a,
|
||||
A.a + 1000*B.a,
|
||||
A.a + 1000*B.a
|
||||
from t1 A, t0 B
|
||||
order by A.a+1000*B.a;
|
||||
#################################################################
|
||||
## Try a basic example
|
||||
flush status;
|
||||
create table t21 as
|
||||
select
|
||||
sum(b) over (order by a rows between 2 preceding and 2 following) as SUM_B
|
||||
from
|
||||
t10;
|
||||
show status like 'Sort_merge_passes';
|
||||
Variable_name Value
|
||||
Sort_merge_passes 0
|
||||
set sort_buffer_size=1024;
|
||||
flush status;
|
||||
create table t22 as
|
||||
select
|
||||
sum(b) over (order by a rows between 2 preceding and 2 following) as SUM_B
|
||||
from
|
||||
t10;
|
||||
show status like 'Sort_merge_passes';
|
||||
Variable_name Value
|
||||
Sort_merge_passes 35
|
||||
include/diff_tables.inc [t21, t22]
|
||||
drop table t21, t22;
|
||||
#################################################################
|
||||
# Try many cursors
|
||||
set sort_buffer_size=default;
|
||||
flush status;
|
||||
create table t21 as
|
||||
select
|
||||
sum(b) over (order by a rows between 2 preceding and 2 following) as SUM_B1,
|
||||
sum(b) over (order by a rows between 5 preceding and 5 following) as SUM_B2,
|
||||
sum(b) over (order by a rows between 20 preceding and 20 following) as SUM_B3
|
||||
from
|
||||
t10;
|
||||
show status like 'Sort_merge_passes';
|
||||
Variable_name Value
|
||||
Sort_merge_passes 0
|
||||
set sort_buffer_size=1024;
|
||||
flush status;
|
||||
create table t22 as
|
||||
select
|
||||
sum(b) over (order by a rows between 2 preceding and 2 following) as SUM_B1,
|
||||
sum(b) over (order by a rows between 5 preceding and 5 following) as SUM_B2,
|
||||
sum(b) over (order by a rows between 20 preceding and 20 following) as SUM_B3
|
||||
from
|
||||
t10;
|
||||
show status like 'Sort_merge_passes';
|
||||
Variable_name Value
|
||||
Sort_merge_passes 35
|
||||
include/diff_tables.inc [t21, t22]
|
||||
drop table t21, t22;
|
||||
#################################################################
|
||||
# Try having cursors pointing at different IO_CACHE pages
|
||||
# in the IO_CACHE
|
||||
set sort_buffer_size=default;
|
||||
flush status;
|
||||
create table t21 as
|
||||
select
|
||||
a,
|
||||
sum(b) over (order by a range between 5000 preceding and 5000 following) as SUM_B1
|
||||
from
|
||||
t10;
|
||||
show status like 'Sort_merge_passes';
|
||||
Variable_name Value
|
||||
Sort_merge_passes 0
|
||||
set sort_buffer_size=1024;
|
||||
flush status;
|
||||
create table t22 as
|
||||
select
|
||||
a,
|
||||
sum(b) over (order by a range between 5000 preceding and 5000 following) as SUM_B1
|
||||
from
|
||||
t10;
|
||||
show status like 'Sort_merge_passes';
|
||||
Variable_name Value
|
||||
Sort_merge_passes 35
|
||||
include/diff_tables.inc [t21, t22]
|
||||
drop table t21, t22;
|
||||
#################################################################
|
||||
drop table t10;
|
||||
drop table t0,t1;
|
104
mysql-test/t/win_big.test
Normal file
104
mysql-test/t/win_big.test
Normal file
@ -0,0 +1,104 @@
|
||||
#
|
||||
# Tests for window functions over big datasets.
|
||||
# "Big" here is "big enough so that filesort result doesn't fit in a
|
||||
# memory buffer".
|
||||
#
|
||||
#
|
||||
|
||||
create table t0 (a int);
|
||||
insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
|
||||
|
||||
create table t1(a int);
|
||||
insert into t1 select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
|
||||
|
||||
create table t10 (a int, b int, c int);
|
||||
insert into t10
|
||||
select
|
||||
A.a + 1000*B.a,
|
||||
A.a + 1000*B.a,
|
||||
A.a + 1000*B.a
|
||||
from t1 A, t0 B
|
||||
order by A.a+1000*B.a;
|
||||
|
||||
--echo #################################################################
|
||||
--echo ## Try a basic example
|
||||
flush status;
|
||||
create table t21 as
|
||||
select
|
||||
sum(b) over (order by a rows between 2 preceding and 2 following) as SUM_B
|
||||
from
|
||||
t10;
|
||||
show status like 'Sort_merge_passes';
|
||||
|
||||
set sort_buffer_size=1024;
|
||||
flush status;
|
||||
create table t22 as
|
||||
select
|
||||
sum(b) over (order by a rows between 2 preceding and 2 following) as SUM_B
|
||||
from
|
||||
t10;
|
||||
show status like 'Sort_merge_passes';
|
||||
|
||||
let $diff_tables= t21, t22;
|
||||
source include/diff_tables.inc;
|
||||
drop table t21, t22;
|
||||
|
||||
--echo #################################################################
|
||||
--echo # Try many cursors
|
||||
set sort_buffer_size=default;
|
||||
flush status;
|
||||
create table t21 as
|
||||
select
|
||||
sum(b) over (order by a rows between 2 preceding and 2 following) as SUM_B1,
|
||||
sum(b) over (order by a rows between 5 preceding and 5 following) as SUM_B2,
|
||||
sum(b) over (order by a rows between 20 preceding and 20 following) as SUM_B3
|
||||
from
|
||||
t10;
|
||||
show status like 'Sort_merge_passes';
|
||||
|
||||
set sort_buffer_size=1024;
|
||||
flush status;
|
||||
create table t22 as
|
||||
select
|
||||
sum(b) over (order by a rows between 2 preceding and 2 following) as SUM_B1,
|
||||
sum(b) over (order by a rows between 5 preceding and 5 following) as SUM_B2,
|
||||
sum(b) over (order by a rows between 20 preceding and 20 following) as SUM_B3
|
||||
from
|
||||
t10;
|
||||
show status like 'Sort_merge_passes';
|
||||
|
||||
let $diff_tables= t21, t22;
|
||||
source include/diff_tables.inc;
|
||||
drop table t21, t22;
|
||||
|
||||
--echo #################################################################
|
||||
--echo # Try having cursors pointing at different IO_CACHE pages
|
||||
--echo # in the IO_CACHE
|
||||
set sort_buffer_size=default;
|
||||
flush status;
|
||||
create table t21 as
|
||||
select
|
||||
a,
|
||||
sum(b) over (order by a range between 5000 preceding and 5000 following) as SUM_B1
|
||||
from
|
||||
t10;
|
||||
show status like 'Sort_merge_passes';
|
||||
|
||||
set sort_buffer_size=1024;
|
||||
flush status;
|
||||
create table t22 as
|
||||
select
|
||||
a,
|
||||
sum(b) over (order by a range between 5000 preceding and 5000 following) as SUM_B1
|
||||
from
|
||||
t10;
|
||||
show status like 'Sort_merge_passes';
|
||||
|
||||
let $diff_tables= t21, t22;
|
||||
source include/diff_tables.inc;
|
||||
drop table t21, t22;
|
||||
--echo #################################################################
|
||||
|
||||
drop table t10;
|
||||
drop table t0,t1;
|
||||
|
@ -193,6 +193,7 @@ int init_io_cache(IO_CACHE *info, File file, size_t cachesize,
|
||||
info->alloced_buffer = 0;
|
||||
info->buffer=0;
|
||||
info->seek_not_done= 0;
|
||||
info->next_file_user= NULL;
|
||||
|
||||
if (file >= 0)
|
||||
{
|
||||
@ -328,6 +329,101 @@ int init_io_cache(IO_CACHE *info, File file, size_t cachesize,
|
||||
DBUG_RETURN(0);
|
||||
} /* init_io_cache */
|
||||
|
||||
|
||||
|
||||
/*
|
||||
Initialize the slave IO_CACHE to read the same file (and data)
|
||||
as master does.
|
||||
|
||||
One can create multiple slaves from a single master. Every slave and master
|
||||
will have independent file positions.
|
||||
|
||||
The master must be a non-shared READ_CACHE.
|
||||
It is assumed that no more reads are done after a master and/or a slave
|
||||
has been freed (this limitation can be easily lifted).
|
||||
*/
|
||||
|
||||
int init_slave_io_cache(IO_CACHE *master, IO_CACHE *slave)
|
||||
{
|
||||
uchar *slave_buf;
|
||||
DBUG_ASSERT(master->type == READ_CACHE);
|
||||
DBUG_ASSERT(!master->share);
|
||||
DBUG_ASSERT(master->alloced_buffer);
|
||||
|
||||
if (!(slave_buf= (uchar*)my_malloc(master->buffer_length, MYF(0))))
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
memcpy(slave, master, sizeof(IO_CACHE));
|
||||
slave->buffer= slave_buf;
|
||||
|
||||
memcpy(slave->buffer, master->buffer, master->buffer_length);
|
||||
slave->read_pos= slave->buffer + (master->read_pos - master->buffer);
|
||||
slave->read_end= slave->buffer + (master->read_end - master->buffer);
|
||||
|
||||
DBUG_ASSERT(master->current_pos == &master->read_pos);
|
||||
slave->current_pos= &slave->read_pos;
|
||||
DBUG_ASSERT(master->current_end == &master->read_end);
|
||||
slave->current_end= &slave->read_end;
|
||||
|
||||
if (master->next_file_user)
|
||||
{
|
||||
IO_CACHE *p;
|
||||
for (p= master->next_file_user;
|
||||
p->next_file_user !=master;
|
||||
p= p->next_file_user)
|
||||
{}
|
||||
|
||||
p->next_file_user= slave;
|
||||
slave->next_file_user= master;
|
||||
}
|
||||
else
|
||||
{
|
||||
slave->next_file_user= master;
|
||||
master->next_file_user= slave;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void end_slave_io_cache(IO_CACHE *cache)
|
||||
{
|
||||
my_free(cache->buffer);
|
||||
}
|
||||
|
||||
/*
|
||||
Seek a read io cache to a given offset
|
||||
*/
|
||||
void seek_io_cache(IO_CACHE *cache, my_off_t needed_offset)
|
||||
{
|
||||
my_off_t cached_data_start= cache->pos_in_file;
|
||||
my_off_t cached_data_end= cache->pos_in_file + (cache->read_pos -
|
||||
cache->buffer);
|
||||
if (needed_offset >= cached_data_start &&
|
||||
needed_offset < cached_data_end)
|
||||
{
|
||||
/*
|
||||
The offset we're seeking to is in the buffer.
|
||||
Move buffer's read position accordingly
|
||||
*/
|
||||
cache->read_pos= cache->buffer + (needed_offset - cached_data_start);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (needed_offset > cache->end_of_file)
|
||||
needed_offset= cache->end_of_file;
|
||||
/*
|
||||
The offset we're seeking to is not in the buffer.
|
||||
- Set the buffer to be exhausted.
|
||||
- Make the next read to a mysql_file_seek() call to the required
|
||||
offset (but still use aligned reads).
|
||||
*/
|
||||
cache->read_pos= cache->read_end;
|
||||
cache->seek_not_done= 1;
|
||||
cache->pos_in_file= (needed_offset / IO_SIZE) * IO_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
/* Wait until current request is ready */
|
||||
|
||||
#ifdef HAVE_AIOWAIT
|
||||
@ -583,6 +679,17 @@ int _my_b_cache_read(IO_CACHE *info, uchar *Buffer, size_t Count)
|
||||
{
|
||||
/* No error, reset seek_not_done flag. */
|
||||
info->seek_not_done= 0;
|
||||
|
||||
if (info->next_file_user)
|
||||
{
|
||||
IO_CACHE *c;
|
||||
for (c= info->next_file_user;
|
||||
c!= info;
|
||||
c= c->next_file_user)
|
||||
{
|
||||
c->seek_not_done= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -671,7 +778,19 @@ int _my_b_cache_read(IO_CACHE *info, uchar *Buffer, size_t Count)
|
||||
DBUG_RETURN(0); /* EOF */
|
||||
}
|
||||
}
|
||||
else if ((length= mysql_file_read(info->file,info->buffer, max_length,
|
||||
else
|
||||
{
|
||||
if (info->next_file_user)
|
||||
{
|
||||
IO_CACHE *c;
|
||||
for (c= info->next_file_user;
|
||||
c!= info;
|
||||
c= c->next_file_user)
|
||||
{
|
||||
c->seek_not_done= 1;
|
||||
}
|
||||
}
|
||||
if ((length= mysql_file_read(info->file,info->buffer, max_length,
|
||||
info->myflags)) < Count ||
|
||||
length == (size_t) -1)
|
||||
{
|
||||
@ -688,6 +807,7 @@ int _my_b_cache_read(IO_CACHE *info, uchar *Buffer, size_t Count)
|
||||
info->seek_not_done=1;
|
||||
DBUG_RETURN(1);
|
||||
}
|
||||
}
|
||||
/*
|
||||
Count is the remaining number of bytes requested.
|
||||
length is the amount of data in the cache.
|
||||
|
@ -515,17 +515,6 @@ void order_window_funcs_by_window_specs(List<Item_window_func> *win_func_list)
|
||||
// note: make rr_from_pointers static again when not need it here anymore
|
||||
int rr_from_pointers(READ_RECORD *info);
|
||||
|
||||
/*
|
||||
A temporary way to clone READ_RECORD structures until Monty provides the real
|
||||
one.
|
||||
*/
|
||||
bool clone_read_record(const READ_RECORD *src, READ_RECORD *dst)
|
||||
{
|
||||
//DBUG_ASSERT(src->table->sort.record_pointers);
|
||||
DBUG_ASSERT(src->read_record == rr_from_pointers);
|
||||
memcpy(dst, src, sizeof(READ_RECORD));
|
||||
return false;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -540,68 +529,145 @@ bool clone_read_record(const READ_RECORD *src, READ_RECORD *dst)
|
||||
class Rowid_seq_cursor
|
||||
{
|
||||
public:
|
||||
virtual ~Rowid_seq_cursor() {}
|
||||
Rowid_seq_cursor() : io_cache(NULL), ref_buffer(0) {}
|
||||
virtual ~Rowid_seq_cursor()
|
||||
{
|
||||
if (ref_buffer)
|
||||
my_free(ref_buffer);
|
||||
if (io_cache)
|
||||
{
|
||||
end_slave_io_cache(io_cache);
|
||||
my_free(io_cache);
|
||||
io_cache= NULL;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
/* Length of one rowid element */
|
||||
size_t ref_length;
|
||||
|
||||
/* If io_cache=!NULL, use it */
|
||||
IO_CACHE *io_cache;
|
||||
uchar *ref_buffer; /* Buffer for the last returned rowid */
|
||||
uint rownum; /* Number of the rowid that is about to be returned */
|
||||
bool cache_eof; /* whether we've reached EOF */
|
||||
|
||||
/* The following are used when we are reading from an array of pointers */
|
||||
uchar *cache_start;
|
||||
uchar *cache_pos;
|
||||
uchar *cache_end;
|
||||
public:
|
||||
|
||||
void init(READ_RECORD *info)
|
||||
{
|
||||
ref_length= info->ref_length;
|
||||
if (info->read_record == rr_from_pointers)
|
||||
{
|
||||
io_cache= NULL;
|
||||
cache_start= info->cache_pos;
|
||||
cache_pos= info->cache_pos;
|
||||
cache_end= info->cache_end;
|
||||
ref_length= info->ref_length;
|
||||
}
|
||||
else
|
||||
{
|
||||
//DBUG_ASSERT(info->read_record == rr_from_tempfile);
|
||||
rownum= 0;
|
||||
cache_eof= false;
|
||||
io_cache= (IO_CACHE*)my_malloc(sizeof(IO_CACHE), MYF(0));
|
||||
init_slave_io_cache(info->io_cache, io_cache);
|
||||
|
||||
ref_buffer= (uchar*)my_malloc(ref_length, MYF(0));
|
||||
}
|
||||
}
|
||||
|
||||
virtual int next()
|
||||
{
|
||||
if (io_cache)
|
||||
{
|
||||
if (cache_eof)
|
||||
return 1;
|
||||
|
||||
if (my_b_read(io_cache,ref_buffer,ref_length))
|
||||
{
|
||||
cache_eof= 1; // TODO: remove cache_eof
|
||||
return -1;
|
||||
}
|
||||
rownum++;
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Allow multiple next() calls in EOF state. */
|
||||
if (cache_pos == cache_end)
|
||||
return -1;
|
||||
|
||||
cache_pos+= ref_length;
|
||||
DBUG_ASSERT(cache_pos <= cache_end);
|
||||
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
virtual int prev()
|
||||
{
|
||||
if (io_cache)
|
||||
{
|
||||
if (rownum == 0)
|
||||
return -1;
|
||||
|
||||
move_to(rownum - 1);
|
||||
return 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Allow multiple prev() calls when positioned at the start. */
|
||||
if (cache_pos == cache_start)
|
||||
return -1;
|
||||
cache_pos-= ref_length;
|
||||
DBUG_ASSERT(cache_pos >= cache_start);
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
ha_rows get_rownum() const
|
||||
{
|
||||
if (io_cache)
|
||||
return rownum;
|
||||
else
|
||||
return (cache_pos - cache_start) / ref_length;
|
||||
}
|
||||
|
||||
void move_to(ha_rows row_number)
|
||||
{
|
||||
if (io_cache)
|
||||
{
|
||||
seek_io_cache(io_cache, row_number * ref_length);
|
||||
rownum= row_number;
|
||||
next();
|
||||
}
|
||||
else
|
||||
{
|
||||
cache_pos= MY_MIN(cache_end, cache_start + row_number * ref_length);
|
||||
DBUG_ASSERT(cache_pos <= cache_end);
|
||||
}
|
||||
|
||||
protected:
|
||||
bool at_eof() { return (cache_pos == cache_end); }
|
||||
|
||||
uchar *get_prev_rowid()
|
||||
{
|
||||
if (cache_pos == cache_start)
|
||||
return NULL;
|
||||
else
|
||||
return cache_pos - ref_length;
|
||||
}
|
||||
|
||||
uchar *get_curr_rowid() { return cache_pos; }
|
||||
protected:
|
||||
bool at_eof()
|
||||
{
|
||||
if (io_cache)
|
||||
{
|
||||
return cache_eof;
|
||||
}
|
||||
else
|
||||
return (cache_pos == cache_end);
|
||||
}
|
||||
|
||||
private:
|
||||
uchar *cache_start;
|
||||
uchar *cache_pos;
|
||||
uchar *cache_end;
|
||||
uint ref_length;
|
||||
uchar *get_curr_rowid()
|
||||
{
|
||||
if (io_cache)
|
||||
return ref_buffer;
|
||||
else
|
||||
return cache_pos;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user