mariadb/mysql-test/main/subselect_mat_cost.test

#
# Tests of cost-based choice between the materialization and in-to-exists
# subquery execution strategies (MWL#89)
#
# The test file is divided into two groups of tests:
# A. Typical cases when either of the two strategies is selected:
#    1. Subquery in disjunctive WHERE clause of the outer query.
#    2. NOT IN subqueries
#    3. Subqueries with GROUP BY, HAVING, and aggregate functions
#    4. Subqueries in the SELECT and HAVING clauses
#    5. Subqueries with UNION
# B. Reasonably exhaustive tests of the various combinations of optimizer
#    switches, data distribution, available indexes, and typical queries.
#

set @subselect_mat_cost=@@optimizer_switch;
set optimizer_switch='mrr=on,mrr_sort_keys=on,index_condition_pushdown=on';
#
# Test logging to slow log (there was some errors in the log files about
# the slow log when running under valgrind, so better to get this tested)
#
set long_query_time=0.1;


-- echo TEST GROUP 1:
-- echo Typical cases of in-to-exists and materialization subquery strategies
-- echo =====================================================================

--disable_warnings
drop database if exists world;
--enable_warnings

set names utf8;

create database world;
use world;

--source include/world_schema.inc
--disable_query_log
--disable_result_log
--disable_warnings
--source include/world.inc
--enable_warnings
--enable_result_log
--enable_query_log

-- echo Make the schema and data more diverse by adding more indexes, nullable
-- echo columns, and NULL data.
create index SurfaceArea on Country(SurfaceArea);
create index Language on CountryLanguage(Language);
create index CityName on City(Name);
alter table City change population population int(11) null default 0;

select max(id) from City into @max_city_id;
insert into City values (@max_city_id + 1,'Kilifarevo','BGR',NULL);


SELECT COUNT(*) FROM Country;
SELECT COUNT(*) FROM City;
SELECT COUNT(*) FROM CountryLanguage;

set @@optimizer_switch = 'in_to_exists=on,semijoin=on,materialization=on,partial_match_rowid_merge=on,partial_match_table_scan=on,subquery_cache=on';

-- echo
-- echo 1. Subquery in a disjunctive WHERE clause of the outer query.
-- echo

-- echo
-- echo Q1.1m:
-- echo MATERIALIZATION: there are too many rows in the outer query
-- echo to be looked up in the inner table.
EXPLAIN
SELECT Name FROM Country
WHERE (Code IN (select Country from City where City.Population > 100000) OR
       Name LIKE 'L%') AND
      surfacearea > 1000000;

SELECT Name FROM Country
WHERE (Code IN (select Country from City where City.Population > 100000) OR
       Name LIKE 'L%') AND
      surfacearea > 1000000;

-- echo Q1.1e:
-- echo IN-EXISTS: the materialization cost is the same as above, but
-- echo there are much fewer outer rows to be looked up, thus the
-- echo materialization cost is too high to compensate for fast lookups.
EXPLAIN
SELECT Name FROM Country
WHERE (Code IN (select Country from City where City.Population > 100000) OR
       Name LIKE 'L%') AND
      surfacearea > 10*1000000;

SELECT Name FROM Country
WHERE (Code IN (select Country from City where City.Population > 100000) OR
       Name LIKE 'L%') AND
      surfacearea > 10*1000000;

-- echo
-- echo Q1.2m:
-- echo MATERIALIZATION: the IN predicate is pushed (attached) to the last table
-- echo in the join order (Country, City), therefore there are too many row
-- echo combinations to filter by re-executing the subquery for each combination.
EXPLAIN
SELECT *
  FROM Country, City
  WHERE City.Country = Country.Code AND
        Country.SurfaceArea < 3000 AND Country.SurfaceArea > 10 AND
        (City.Name IN
         (select Language from CountryLanguage where Percentage > 50) OR
         City.name LIKE '%Island%');

SELECT *
  FROM Country, City
  WHERE City.Country = Country.Code AND
        Country.SurfaceArea < 3000 AND Country.SurfaceArea > 10 AND
        (City.Name IN
         (select Language from CountryLanguage where Percentage > 50) OR
         City.name LIKE '%Island%');

-- echo Q1.2e:
-- echo IN_EXISTS: join order is the same, but the left IN operand refers to
-- echo only the first table in the join order (Country), so there are much
-- echo fewer rows to filter by subquery re-execution.
EXPLAIN
SELECT *
  FROM Country, City
  WHERE City.Country = Country.Code AND
        Country.SurfaceArea < 3000 AND Country.SurfaceArea > 10 AND
        (Country.Name IN
         (select Language from CountryLanguage where Percentage > 50) OR
         Country.name LIKE '%Island%');

SELECT *
  FROM Country, City
  WHERE City.Country = Country.Code AND
        Country.SurfaceArea < 3000 AND Country.SurfaceArea > 10 AND
        (Country.Name IN
         (select Language from CountryLanguage where Percentage > 50) OR
         Country.name LIKE '%Island%');


-- echo
-- echo Q1.3:
-- echo For the same reasons as in Q2 IN-EXISTS and MATERIALIZATION chosen
-- echo for each respective subquery.
EXPLAIN
SELECT City.Name, Country.Name
  FROM City,Country
  WHERE City.Country = Country.Code AND
        Country.SurfaceArea < 30000 AND Country.SurfaceArea > 10 AND
        ((Country.Code, Country.Name) IN
         (select Country, Language from CountryLanguage where Percentage > 50) AND
         Country.Population > 3000000
         OR
         (Country.Code, City.Name) IN
         (select Country, Language from CountryLanguage));

SELECT City.Name, Country.Name
  FROM City,Country
  WHERE City.Country = Country.Code AND
        Country.SurfaceArea < 30000 AND Country.SurfaceArea > 10 AND
        ((Country.Code, Country.Name) IN
         (select Country, Language from CountryLanguage where Percentage > 50) AND
         Country.Population > 3000000
         OR
         (Country.Code, City.Name) IN
         (select Country, Language from CountryLanguage));


-- echo
-- echo 2. NOT IN subqueries
-- echo

-- echo
-- echo Q2.1:
-- echo Number of cities that are not capitals in countries with small population.
-- echo MATERIALIZATION is 50 times faster because the cost of each subquery
-- echo re-execution is much higher than the cost of index lookups into the
-- echo materialized subquery.

EXPLAIN
select count(*) from City
where City.id not in (select capital from Country
                      where capital is not null and population < 100000);

-- echo
-- echo Q2.2e:
-- echo Countries that speak French, but do not speak English
-- echo IN-EXISTS because the outer query filters many rows, thus
-- echo there are few lookups to make.
EXPLAIN
SELECT Country.Name
FROM Country, CountryLanguage
WHERE Code NOT IN (SELECT Country FROM CountryLanguage WHERE Language = 'English')
  AND CountryLanguage.Language = 'French'
  AND Code = Country;

SELECT Country.Name
FROM Country, CountryLanguage
WHERE Code NOT IN (SELECT Country FROM CountryLanguage WHERE Language = 'English')
  AND CountryLanguage.Language = 'French'
  AND Code = Country;

-- echo Q2.2m:
-- echo Countries that speak French OR Spanish, but do not speak English
-- echo MATERIALIZATION because the outer query filters less rows than Q5-a,
-- echo so there are more lookups.


set statement optimizer_switch='rowid_filter=off' for
EXPLAIN
SELECT Country.Name
FROM Country, CountryLanguage
WHERE Code NOT IN (SELECT Country FROM CountryLanguage WHERE Language = 'English')
  AND (CountryLanguage.Language = 'French' OR CountryLanguage.Language = 'Spanish')
  AND Code = Country;

set statement optimizer_switch='rowid_filter=off' for
SELECT Country.Name
FROM Country, CountryLanguage
WHERE Code NOT IN (SELECT Country FROM CountryLanguage WHERE Language = 'English')
  AND (CountryLanguage.Language = 'French' OR CountryLanguage.Language = 'Spanish')
  AND Code = Country;

-- echo
-- echo Q2.3e:
-- echo Not a very meaningful query that tests NOT IN.
-- echo IN-EXISTS because the outer query is cheap enough to reexecute many times.
EXPLAIN
select count(*)
from CountryLanguage
where (Language, Country) NOT IN
      (SELECT City.Name, Country.Code
       FROM City LEFT JOIN Country ON (Country = Code and City.Population < 10000))
      AND Language IN ('English','Spanish');

select count(*)
from CountryLanguage
where (Language, Country) NOT IN
      (SELECT City.Name, Country.Code
       FROM City LEFT JOIN Country ON (Country = Code and City.Population < 10000))
      AND Language IN ('English','Spanish');

-- echo Q2.3m:
-- echo MATERIALIZATION with the PARTIAL_MATCH_MERGE strategy, because the HAVING
-- echo clause prevents the use of the index on City(Name), and in practice reduces
-- echo radically the size of the temp table.
EXPLAIN
select count(*)
from CountryLanguage
where (Language, Country) NOT IN
      (SELECT City.Name, Country.Code
       FROM City LEFT JOIN Country ON (Country = Code)
       HAVING City.Name LIKE "Santa%");

select count(*)
from CountryLanguage
where (Language, Country) NOT IN
      (SELECT City.Name, Country.Code
       FROM City LEFT JOIN Country ON (Country = Code)
       HAVING City.Name LIKE "Santa%");


-- echo
-- echo 3. Subqueries with GROUP BY, HAVING, and aggregate functions
-- echo

-- echo Q3.1:
-- echo Languages that are spoken in countries with 10 or 11 languages
-- echo MATERIALIZATION is about 100 times faster than IN-EXISTS.

EXPLAIN
select count(*)
from CountryLanguage
where
(Country, 10) IN (SELECT Code, COUNT(*) FROM CountryLanguage, Country
                  WHERE Code = Country GROUP BY Code)
OR
(Country, 11) IN (SELECT Code, COUNT(*) FROM CountryLanguage, Country
                  WHERE Code = Country GROUP BY Code)
order by Country;

select count(*)
from CountryLanguage
where
(Country, 10) IN (SELECT Code, COUNT(*) FROM CountryLanguage, Country
                  WHERE Code = Country GROUP BY Code)
OR
(Country, 11) IN (SELECT Code, COUNT(*) FROM CountryLanguage, Country
                  WHERE Code = Country GROUP BY Code)
order by Country;


-- echo
-- echo Q3.2:
-- echo Countries whose capital is a city name that names more than one
-- echo cities.
-- echo MATERIALIZATION because the cost of single subquery execution is
-- echo close to that of materializing the subquery.

EXPLAIN
select * from Country, City
where capital = id and
      (City.name in (SELECT name FROM City
                     GROUP BY name HAVING Count(*) > 2) OR
       capital is null);

select * from Country, City
where capital = id and
      (City.name in (SELECT name FROM City
                     GROUP BY name HAVING Count(*) > 2) OR
       capital is null);

-- echo
-- echo Q3.3: MATERIALIZATION is 25 times faster than IN-EXISTS

EXPLAIN
SELECT Name
FROM Country
WHERE Country.Code NOT IN
      (SELECT Country FROM City GROUP BY Name HAVING COUNT(Name) = 1);

SELECT Name
FROM Country
WHERE Country.Code NOT IN
      (SELECT Country FROM City GROUP BY Name HAVING COUNT(Name) = 1);


-- echo
-- echo 4. Subqueries in the SELECT and HAVING clauses
-- echo

-- echo Q4.1m:
-- echo Capital information about very big cities
-- echo MATERIALIZATION
EXPLAIN
select Name, City.id in (select capital from Country where capital is not null) as is_capital
from City
where City.population > 10000000;

select Name, City.id in (select capital from Country where capital is not null) as is_capital
from City
where City.population > 10000000;

-- echo Q4.1e:
-- echo IN-TO-EXISTS after adding an index to make the subquery re-execution
-- echo efficient.

create index CountryCapital on Country(capital);

EXPLAIN
select Name, City.id in (select capital from Country where capital is not null) as is_capital
from City
where City.population > 10000000;

select Name, City.id in (select capital from Country where capital is not null) as is_capital
from City
where City.population > 10000000;

drop index CountryCapital on Country;

-- echo
-- echo Q4.2:
-- echo MATERIALIZATION
# TODO: the cost estimates for subqueries in the HAVING clause need to be changed
# to take into account that the subquery predicate is executed #times ~ to the
# number of groups, not number of rows
EXPLAIN
SELECT City.Name, City.Population
FROM City JOIN Country ON City.Country = Country.Code
GROUP BY City.Name
HAVING City.Name IN (select Name from Country where population < 1000000);

SELECT City.Name, City.Population
FROM City JOIN Country ON City.Country = Country.Code
GROUP BY City.Name
HAVING City.Name IN (select Name from Country where population < 1000000);


-- echo
-- echo 5. Subqueries with UNION
-- echo

-- echo Q5.1:
EXPLAIN
SELECT * from City where (Name, 91) in
(SELECT Name, round(Population/1000)
 FROM City
 WHERE Country = "IND" AND Population > 2500000
UNION
 SELECT Name, round(Population/1000)
 FROM City
 WHERE Country = "IND" AND Population < 100000);

SELECT * from City where (Name, 91) in
(SELECT Name, round(Population/1000)
 FROM City
 WHERE Country = "IND" AND Population > 2500000
UNION
 SELECT Name, round(Population/1000)
 FROM City
 WHERE Country = "IND" AND Population < 100000);

set @@optimizer_switch='default';
drop database world;
-- echo


-- echo
-- echo TEST GROUP 2:
-- echo Tests of various combinations of optimizer switches, types of queries,
-- echo available indexes, column nullability, constness of tables/predicates.
-- echo =====================================================================


#TODO From Igor's review:
#
#2.1 Please add a case when two subqueries  are used in the where clause
#(or in select) of a 2-way join.
#The first subquery is accessed after the first table, while the second
#is accessed after the second table.
#
#2.2. Please add a test case when one non-correlated subquery contains
#another non-correlated subquery.
#Consider 4 subcases:
#   both subqueries are materialized
#   IN_EXIST transformations are applied to both subqueries
#   outer subquery is materialized while the inner subquery  is not
#(IN_EXIST transformation is applied to it)
#   inner subqyery is materialized while the outer subquery  is not (
#IN_EXIST transformation is applied to it)

set optimizer_switch=@subselect_mat_cost;