Command: Query Time: 184551 State: Sending data Info: select item_id, sum(sold) as sold from stat_item_detail where item_id in ( select item_id from stat_item_detail where gmt_create >= '2019-10-05 08:59:00') group by item_id
create table stat_item_detail( id int not null auto_increment, item_id int not null, sold int not null, gmt_create datetime not null, padding varchar(4000), primary key(id), key idx_item_id(item_id), key idx_gmt_create(gmt_create) ) engine=innodb;
create view digit as select 0 as a union all select 1 union all select 2 union all select 3 union all select 4 union all select 5 union all select 6 union all select 7 union all select 8 union all select 9 ;
create view numbers_1m AS select ((((a.a * 10 + b.a)*10 + c.a)*10 + d.a)*10+e.a)*10+f.a as n from digit a, digit b, digit c, digit d, digit e, digit f;
insert into stat_item_detail(item_id, sold, gmt_create, padding) select n + 1000000 - n % 2 as item_id, n % 100 - n%100%2, date_add('2024-06-01 00:00:00', interval n minute) as gmt_create, rpad('x', 1000, 'abcdefg ') as padding from numbers_1m;
当时用的还是MySQL 5.1和5.5的版本。我们先来看一下在5.5中这个SQL的执行计划。
1 2 3 4 5 6 7 8 9 10 11 12 13 14
mysql> explain select item_id, sum(sold) as sold from stat_item_detail where item_id in ( select item_id from stat_item_detail where Gmt_create >= '2026-04-26 10:30:00') group by item_id;
+----+--------------------+------------------+----------------+----------------------------+-------------+---------+------+---------+-------------+ | id | select_type | table | type | possible_keys | key | key_len | ref | rows | Extra | +----+--------------------+------------------+----------------+----------------------------+-------------+---------+------+---------+-------------+ | 1 | PRIMARY | stat_item_detail | index | NULL | idx_item_id | 4 | NULL | 1000029 | Using where | | 2 | DEPENDENT SUBQUERY | stat_item_detail | index_subquery | idx_item_id,idx_gmt_create | idx_item_id | 4 | func | 1 | Using where | +----+--------------------+------------------+----------------+----------------------------+-------------+---------+------+---------+-------------+
从上面的这个执行计划可以看到,这个SQL在执行时,先全量扫描索引idx_item_id,每得到一个item_id后,执行相关子查询(DEPENDENT SUBQUERY)select 1 from stat_item_detail where gmt_create >= ‘2026-04-26 10:30:00’ and item_id = primary.item_id。当主查询中表中的数据量很大的时候,子查询执行的次数也会很多,因此SQL的性能非常差。
在我的测试环境中,执行这个SQL需要45秒左右。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
mysql> select item_id, sum(sold) as sold from stat_item_detail where item_id in ( select item_id from stat_item_detail where Gmt_create >= '2026-04-26 10:30:00') group by item_id; +---------+------+ | item_id | sold | +---------+------+ | 1999990 | 180 | | 1999992 | 184 | | 1999994 | 188 | | 1999996 | 192 | | 1999998 | 196 | +---------+------+ 5 rows in set (44.64 sec)
那么将IN改成exists后,是否能提升性能呢?我们来试一下,可以看到执行时间和使用IN基本一样。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
mysql> select item_id, sum(sold) as sold from stat_item_detail t1 where exists ( select 1 from stat_item_detail where gmt_create >= '2026-04-26 10:30:00' and item_id = t1.item_id ) group by item_id;
mysql> select t1.item_id, sum(t1.sold) as sold from stat_item_detail t1, ( select distinct item_id from stat_item_detail t2 where t2.gmt_create >= '2026-04-26 10:30:00') t22 where t1.item_id = t22.item_id group by t1.item_id; +---------+------+ | item_id | sold | +---------+------+ | 1999990 | 180 | | 1999992 | 184 | | 1999994 | 188 | | 1999996 | 192 | | 1999998 | 196 | +---------+------+ 5 rows in set (0.00 sec)
实际上,我们还可以使用另外一种方法来去重,也就是按主表的主键字段来去重。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
select item_id, sum(sold) from ( select distinct t1.id, t1.item_id, t1.sold as sold from stat_item_detail t1, stat_item_detail t2 where t1.item_id = t2.item_id and t2.gmt_create >= '2026-04-26 10:30:00' ) t group by item_id;
MySQL 5.6开始引入了半连接转换,对于前面例子中的SQL,优化器会自动进行查询转换,不需要再手动改写SQL了。在MySQL 5.6和5.7中,还不会对exists做半连接优化。从MySQL 8.0.16开始,优化器对exists子查询也会进行半连接转换。
在8.0的环境中执行这个SQL,MySQL自动把查询转换成了半连接。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
mysql> explain select item_id, sum(sold) as sold from stat_item_detail where item_id in ( select item_id from stat_item_detail where Gmt_create >= '2026-04-26 10:30:00') group by item_id;
mysql> select t1.item_id, sum(t1.sold) as sold from stat_item_detail t1, ( select distinct item_id from stat_item_detail t2 where t2.gmt_create >= '2026-04-26 10:30:00') t22 where t1.item_id = t22.item_id group by t1.item_id;
CREATE TABLE t_parent ( id int not null auto_increment, a int, b int , c int , padding varchar(2000), primary key(id), KEY idx_a (a) ) ENGINE=InnoDB;
CREATE TABLE t_subq ( id int not null auto_increment, a int , b int , c int , d int , padding varchar(2000), primary key(id), UNIQUE KEY uk_cb (c,b), KEY idx_abc (a,b,c) ) ENGINE=InnoDB;
insert into t_parent(a,b,c) values (1,1,1),(2,2,2),(3,3,3),(4,4,4),(5,5,5),(null,0,0),(2,2,2);
insert into t_subq (a,b,c,d) values (1,1,1,1),(2,2,2,2),(3,3,3,3),(2,4,4,2);
mysql> show warnings\G *************************** 1. row *************************** Level: Note Code: 1003 Message: /* select#1 */ select `rep`.`t_parent`.`id` AS `id`,`rep`.`t_parent`.`a` AS `a`,`rep`.`t_parent`.`b` AS `b`,`rep`.`t_parent`.`c` AS `c`,`rep`.`t_parent`.`padding` AS `padding` from `rep`.`t_subq` join `rep`.`t_parent` where ((`rep`.`t_parent`.`a` = `rep`.`t_subq`.`b`) and (`rep`.`t_subq`.`c` = 1))
mysql> show warnings\G *************************** 1. row *************************** Level: Note Code: 1003 Message: /* select#1 */ select `rep`.`t_parent`.`id` AS `id`,`rep`.`t_parent`.`a` AS `a`,`rep`.`t_parent`.`b` AS `b`,`rep`.`t_parent`.`c` AS `c`,`rep`.`t_parent`.`padding` AS `padding` from `rep`.`t_parent` semi join (`rep`.`t_subq`) where ((`rep`.`t_parent`.`a` = `rep`.`t_subq`.`d`) and (`rep`.`t_subq`.`a` in (1,3))) 1 row in set (0.00 sec)
mysql> set optimizer_trace='enabled=on'; Query OK, 0 rows affected (0.00 sec)
mysql> explain select item_id, sum(sold) as sold from stat_item_detail where item_id in ( select item_id from stat_item_detail where gmt_create >= '2026-04-26 10:30:00') group by item_id;
mysql> explain select * from t_parent where b not in ( select b from t_subq where b is not null ); +----+--------------------+----------+-------+---------------+-------+---------+------+------+----------+--------------------------+ | id | select_type | table | type | possible_keys | key | key_len | ref | rows | filtered | Extra | +----+--------------------+----------+-------+---------------+-------+---------+------+------+----------+--------------------------+ | 1 | PRIMARY | t_parent | ALL | NULL | NULL | NULL | NULL | 7 | 100.00 | Using where | | 2 | DEPENDENT SUBQUERY | t_subq | index | NULL | uk_cb | 10 | NULL | 3 | 66.67 | Using where; Using index | +----+--------------------+----------+-------+---------------+-------+---------+------+------+----------+--------------------------+
mysql> explain select * from t_parent where not exists ( select 1 from t_subq where a=t_parent.a) and a is not null; +----+-------------+----------+-------+---------------+---------+---------+----------------+------+----------+--------------------------------------+ | id | select_type | table | type | possible_keys | key | key_len | ref | rows | filtered | Extra | +----+-------------+----------+-------+---------------+---------+---------+----------------+------+----------+--------------------------------------+ | 1 | SIMPLE | t_parent | range | idx_a | idx_a | 5 | NULL | 6 | 100.00 | Using index condition | | 1 | SIMPLE | t_subq | ref | idx_abc | idx_abc | 5 | rep.t_parent.a | 1 | 100.00 | Using where; Not exists; Using index | +----+-------------+----------+-------+---------------+---------+---------+----------------+------+----------+--------------------------------------+
mysql> show warnings\G *************************** 2. row *************************** Level: Note Code: 1003 Message: /* select#1 */ select `rep`.`t_parent`.`id` AS `id`, `rep`.`t_parent`.`a` AS `a`,`rep`.`t_parent`.`b` AS `b`, `rep`.`t_parent`.`c` AS `c`,`rep`.`t_parent`.`padding` AS `padding` from `rep`.`t_parent` anti join (`rep`.`t_subq`) on((`rep`.`t_subq`.`a` = `rep`.`t_parent`.`a`)) where (`rep`.`t_parent`.`a` is not null)
mysql> select * from t_parent where a not in ( select a from t_subq where a is not null); +----+------+------+------+---------+ | id | a | b | c | padding | +----+------+------+------+---------+ | 4 | 4 | 4 | 4 | NULL | | 5 | 5 | 5 | 5 | NULL | +----+------+------+------+---------+
create table emp_salary( id int not null auto_increment, emp_id int not null, dept_id int not null, salary int not null, padding varchar(2000), primary key(id), key idx_emp_id(emp_id), key idx_dept_id(dept_id) ) engine=innodb;
insert into emp_salary(emp_id, dept_id, salary, padding) select 100000 + n, n % 10, 10000 + (n * n) % 10000, rpad('A', 1000, 'ABCD') from numbers;
下面这个SQL,子查询中使用了聚合函数,优化器无法使用半连接转换。
1 2 3 4 5 6 7 8 9 10 11
mysql> explain select * from emp_salary t1 where salary > (select avg(salary) from emp_salary where dept_id = t1.dept_id)
mysql> explain select * from ( select t1.emp_id, t1.dept_id, t1.salary, (select avg(salary) from emp_salary where dept_id = t1.dept_id ) as dept_avg_salary from emp_salary t1 ) t where salary > dept_avg_salary;
mysql> select t1.* from emp_salary t1, ( select dept_id, avg(salary) as avg_salary from emp_salary group by dept_id ) t2 where t1.dept_id = t2.dept_id and t1.salary > t2.avg_salary;
MySQL 8.0增强了子查询的优化能力,对很多简单的子查询,优化器可以自动处理。如果你在子查询中使用了聚合函数,或者在select字段中使用了子查询,可能需要进行手动的优化。使用not in时,要注意子查询中不要出现null的数据,这会导致查询不到任何数据。
思考题
这一讲中,我提供了两种手动改写子查询的思路。
思路1:
1 2 3 4 5 6 7
mysql> select t1.item_id, sum(t1.sold) as sold from stat_item_detail t1, ( select distinct item_id from stat_item_detail t2 where t2.gmt_create >= '2026-04-26 10:30:00') t22 where t1.item_id = t22.item_id group by t1.item_id;
思路2:
1 2 3 4 5 6
select item_id, sum(sold) from ( select distinct t1.id, t1.item_id, t1.sold as sold from stat_item_detail t1, stat_item_detail t2 where t1.item_id = t2.item_id and t2.gmt_create >= '2026-04-26 10:30:00' ) t group by item_id;