sql优化

-- Table "refgene" DDLCREATE TABLE `refgene` (
  `bin` smallint(5) unsigned NOT NULL DEFAULT '0',
  `name` varchar(255) NOT NULL DEFAULT '',
  `chrom` varchar(255) NOT NULL DEFAULT '',
  `strand` char(1) NOT NULL DEFAULT '',
  `txStart` int(11) unsigned NOT NULL DEFAULT '0',
  `txEnd` int(11) unsigned NOT NULL DEFAULT '0',
  `cdsStart` int(11) unsigned NOT NULL DEFAULT '0',
  `cdsEnd` int(11) unsigned NOT NULL DEFAULT '0',
  `exonCount` int(10) unsigned NOT NULL DEFAULT '0',
  `exonStarts` longblob NOT NULL,
  `exonEnds` longblob NOT NULL,
  `id` int(10) unsigned NOT NULL DEFAULT '0',
  `name2` varchar(255) NOT NULL DEFAULT '',
  `cdsStartStat` enum('none','unk','incmpl','cmpl') NOT NULL DEFAULT 'none',
  `cdsEndStat` enum('none','unk','incmpl','cmpl') NOT NULL DEFAULT 'none',
  `exonFrames` longblob NOT NULL,
  KEY `chrom` (`chrom`(7),`bin`),
  KEY `name2` (`name2`(10)),
  KEY `name` (`name`(12)),
  KEY `chrom_2` (`chrom`(7),`txStart`),
  KEY `chrom_3` (`chrom`(7),`txEnd`),
  KEY `ix_all` (`chrom`,`strand`,`txStart`,`txEnd`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
-- Table "g20100806_wjh_h_" DDLCREATE TABLE `g20100806_wjh_h_` (
  `chromosome` varchar(255) DEFAULT NULL,
  `Chip` varchar(255) DEFAULT NULL,
  `start` varchar(255) DEFAULT NULL,
  `stop` varchar(255) DEFAULT NULL,
  `probeids` varchar(255) DEFAULT NULL,
  `seq_id` varchar(255) DEFAULT NULL,
  `count` varchar(255) DEFAULT NULL,
  `ratioH_1` varchar(255) DEFAULT NULL,
  `ratioH_2` varchar(255) DEFAULT NULL,
  `ratioH_3` varchar(255) DEFAULT NULL,
  KEY `ix_all` (`chromosome`,`start`,`stop`),
  KEY `ix_chr` (`chromosome`),
  KEY `ix_start` (`start`),
  KEY `ix_stop` (`stop`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
我想通过这两张表得到第三张表。如下：
//时间和空间复杂度都比较高，不予考虑
delimiter //
create procedure p_create_temp3(length int)
begin
create table temp3
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
  where refGene.chrom=g20100806_wjh_h_.chromosome and ((strand='+' and ((start<=txstart-length and stop>=txstart-length) or (start>=txstart-length and stop<=txstart) or (start>=txstart-length and stop>=txstart)))
  or
   (strand='-' and ((start<=txend and stop>=txend) or (start>=txend and stop<=txend+length) or (start>=txend and stop>=txend+length))));
end
delimiter ;
call p_create_temp3(20);
请问：如何提高其效率，表g20100806_wjh_h_的索引用不上。又或者有没有什么其他的方法可以代替呢？谢谢

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

show index from refgene;
show index from g20100806_wjh_h_;
explain select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
  where refGene.chrom=g20100806_wjh_h_.chromosome
  and (
   (strand='+'
   and ((start<=txstart-length and stop>=txstart-length) or (start>=txstart-length and stop<=txstart) or (start>=txstart-length and stop>=txstart))
   )
or
(strand='-' and ((start<=txend and stop>=txend) or (start>=txend and stop<=txend+length) or (start>=txend and stop>=txend+length)))
  );
贴出三句的结果以供分析。
oot@localhost : tss Sun Sep 26 12:18:55 2010>show index from refgene;
+---------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
| Table   | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment |
+---------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
| refgene |          1 | chrom    |            1 | chrom       | A         |          46 |        7 | NULL   |      | BTREE      |         |
| refgene |          1 | chrom    |            2 | bin         | A         |       12013 |     NULL | NULL   |      | BTREE      |         |
| refgene |          1 | name2    |            1 | name2       | A         |       18020 |       10 | NULL   |      | BTREE      |         |
| refgene |          1 | name     |            1 | name        | A         |       36040 |       12 | NULL   |      | BTREE      |         |
| refgene |          1 | chrom_2  |            1 | chrom       | A         |          46 |        7 | NULL   |      | BTREE      |         |
| refgene |          1 | chrom_2  |            2 | txStart     | A         |       36040 |     NULL | NULL   |      | BTREE      |         |
| refgene |          1 | chrom_3  |            1 | chrom       | A         |          46 |        7 | NULL   |      | BTREE      |         |
| refgene |          1 | chrom_3  |            2 | txEnd       | A         |       36040 |     NULL | NULL   |      | BTREE      |         |
| refgene |          1 | ix_all   |            1 | chrom       | A         |          46 |     NULL | NULL   |      | BTREE      |         |
| refgene |          1 | ix_all   |            2 | strand      | A         |          83 |     NULL | NULL   |      | BTREE      |         |
| refgene |          1 | ix_all   |            3 | txStart     | A         |       36040 |     NULL | NULL   |      | BTREE      |         |
| refgene |          1 | ix_all   |            4 | txEnd       | A         |       36040 |     NULL | NULL   |      | BTREE      |         |
+---------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
12 rows in set (0.05 sec)oot@localhost : tss Sun Sep 26 12:20:01 2010>show index from g20100806_wjh_h_;
+------------------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
| Table            | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment |
+------------------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
| g20100806_wjh_h_ |          1 | ix_all   |            1 | chromosome  | A         |          42 |     NULL | NULL   | YES  | BTREE      |         |
| g20100806_wjh_h_ |          1 | ix_all   |            2 | start       | A         |      385019 |     NULL | NULL   | YES  | BTREE      |         |
| g20100806_wjh_h_ |          1 | ix_all   |            3 | stop        | A         |      385019 |     NULL | NULL   | YES  | BTREE      |         |
| g20100806_wjh_h_ |          1 | ix_chr   |            1 | chromosome  | A         |          42 |     NULL | NULL   | YES  | BTREE      |         |
| g20100806_wjh_h_ |          1 | ix_start |            1 | start       | A         |      385019 |     NULL | NULL   | YES  | BTREE      |         |
| g20100806_wjh_h_ |          1 | ix_stop  |            1 | stop        | A         |      385019 |     NULL | NULL   | YES  | BTREE      |         |
+------------------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
6 rows in set (0.00 sec)
oot@localhost : tss Sun Sep 26 09:38:19 2010>explain select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
    ->   where refGene.chrom=g20100806_wjh_h_.chromosome and  ((txstart>=start+20 and txstart<=stop+20) or (txstart<=start+20 and txstart>=stop) or (txstart<=start+20 and txstart<=stop)) and strand='+' \G;
*************************** 1. row ***************************
           id: 1
  select_type: SIMPLE
        table: g20100806_wjh_h_
         type: ALL
possible_keys: ix_all,ix_chr,ix_stop
          key: NULL
      key_len: NULL
          ref: NULL
         rows: 385019
        Extra:
*************************** 2. row ***************************
           id: 1
  select_type: SIMPLE
        table: refgene
         type: ref
possible_keys: chrom,chrom_2,chrom_3,ix_all
          key: ix_all
      key_len: 258
          ref: tss.g20100806_wjh_h_.chromosome,const
         rows: 434
        Extra: Using where
2 rows in set (0.00 sec)大概就是这样
语句本身很复杂，嵌套了两个 OR （  AND OR）你想实现的功能是什么，看看能不能先简化一下语句。
delimiter //
create procedure p_create_temp3(length int)
begin

select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
  where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and ((start<=txstart-length and stop>=txstart-length) or (start>=txstart-length and stop<=txstart) or (start>=txstart-length and stop>=txstart))
  into outfile 'd:/temp3_positive.txt';

  select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
  where refGene.chrom=g20100806_wjh_h_.chromosome and strand='-' and ((start<=txend and stop>=txend) or (start>=txend and stop<=txend+length) or (start>=txend and stop>=txend+length))
  into outfile 'd:/temp3_negative.txt';
end
delimiter ;
call p_create_temp3(1200);逻辑只能简化到这了。
这样再将文件导入到表中。
可是就单个sql语句也很复杂，大表的索引用不上，速度超慢的
描述一下通过这个SQL语句你想实现的功能，或者整个存储过程想实现的功能。这样有助于别人理解。
如果strand=‘+’，则start，stop在一个特定的区间内，这个区间和length有关
如果strand=‘-’，则start，stop也在令一个特定的区间，这个区间也和length有关
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where refGene.chrom=g20100806_wjh_h_.chromosome
and
(
(strand='+' and
(
(start<=txstart-length and stop>=txstart-length)
or
(start>=txstart-length and stop<=txstart)
or
(start>=txstart-length and stop>=txstart)
)
)
or
(strand='-' and
(
(start<=txend and stop>=txend)
or
(start>=txend and stop<=txend+length)
or
(start>=txend and stop>=txend+length)
)
)
);你的语句的逻辑很奇怪！
比如下面的。 (strand='+' and
(
(start<=txstart-length and stop>=txstart-length)
or
(start>=txstart-length and stop<=txstart)
or
(start>=txstart-length and stop>=txstart)
)
)这两句红色部分
(start>=txstart-length and stop<=txstart) or (start>=txstart-length and stop>=txstart)
就等同于start>=txstart-length and ( stop<=txstart or  stop>=txstart)
也就是 start>=txstart-length and TRUE
建议你能用语言描述一下实际的业务逻辑。
创建这两个复合索引
refGene(chrom,strand,txstart)
refGene(chrom,strand,txend)然后运行如下语句。
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='+' and stop+length>=txstart
union all
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='+' and start+length>=txstart
union all
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='-' and stop>=txend
union all
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='-' and start>=txend;
union all不会出现重复么？
我重新整理了一下逻辑：
delimiter //
create procedure p_create_temp3(length int)
begin

select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
  where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and ((start<=txstart-length and stop>=txstart-length) or (start>=txstart-length and stop<=txstart) or (start<=txstart and stop>=txstart))
  into outfile 'd:/temp3_positive.txt';

  select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
  where refGene.chrom=g20100806_wjh_h_.chromosome and strand='-' and ((start<=txend and stop>=txend) or (start>=txend and stop<=txend+length) or (start<=txend+length and stop>=txend+length))
  into outfile 'd:/temp3_negative.txt';

  create table t_3 select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_ limit 0;

  load data infile 'd:/temp3_positive.txt' into table t_3 fields terminated by '\t'  escaped by '\\' lines terminated by '\n';
  load data infile 'd:/temp3_negative.txt' into table t_3 fields terminated by '\t'  escaped by '\\' lines terminated by '\n';              end
delimiter ;
call p_create_temp3(1200);
这样写的话，索引应该怎样建比较好呢？
这样写是为了能用上联合索引吧，可是为什么用union all而不用union呢？
原来我的写法为什么用不上索引呢？我也建了联合索引啊
建议说明一下。原来我的写法为什么用不上索引呢？我也建了联合索引啊
refGene(chrom,strand,txstart)
refGene(chrom,strand,txend)
这两个索引你建了吗？这样写是为了能用上联合索引吧，可是为什么用union all而不用union呢？
如果担心重复则应该用UNION。
由于你始终不愿意花时间描述你的业务逻辑，所以上面的优化都是根据你的SQL语句实现的。其实如果能所设 stop >= start 的话，你的语句就可以更简单！select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='+' and stop+length>=txstart
union all
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='-' and stop>=txend
首先，我得先谢谢你，我不是不描述，我是觉得描述起来非常的复杂
stop是永远大于start
原来的逻辑有点问题，我改正过来了麻烦你再重新看一下我的sql语句。
delimiter //
create procedure p_create_temp3(length int)
beginselect chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
  where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and ((start<=txstart-length and stop>=txstart-length) or (start>=txstart-length and stop<=txstart) or (start<=txstart and stop>=txstart))
  into outfile 'd:/temp3_positive.txt';

  select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
  where refGene.chrom=g20100806_wjh_h_.chromosome and strand='-' and ((start<=txend and stop>=txend) or (start>=txend and stop<=txend+length) or (start<=txend+length and stop>=txend+length))
  into outfile 'd:/temp3_negative.txt';

  create table t_3 select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_ limit 0;

  load data infile 'd:/temp3_positive.txt' into table t_3 fields terminated by '\t' escaped by '\\' lines terminated by '\n';
  load data infile 'd:/temp3_negative.txt' into table t_3 fields terminated by '\t' escaped by '\\' lines terminated by '\n';   end
delimiter ;
call p_create_temp3(1200);
这样写的话，索引应该怎样建比较好呢？
我觉得你的
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='+' and stop+length>=txstart
union all
select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend
from refgene,g20100806_wjh_h_
where chrom=chromosome
and strand='-' and stop>=txend
应该和我的逻辑不一样吧？？？？？？
root@localhost : tss Sun Sep 26 16:33:17 2010>show index from refgene;
+---------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
| Table   | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment |
+---------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
| refgene |          1 | ix_start |            1 | chrom       | A         |          46 |     NULL | NULL   |      | BTREE      |         |
| refgene |          1 | ix_start |            2 | strand      | A         |          83 |     NULL | NULL   |      | BTREE      |         |
| refgene |          1 | ix_start |            3 | txStart     | A         |       36040 |     NULL | NULL   |      | BTREE      |         |
| refgene |          1 | ix_end   |            1 | chrom       | A         |          46 |     NULL | NULL   |      | BTREE      |         |
| refgene |          1 | ix_end   |            2 | strand      | A         |          83 |     NULL | NULL   |      | BTREE      |         |
| refgene |          1 | ix_end   |            3 | txEnd       | A         |       36040 |     NULL | NULL   |      | BTREE      |         |
+---------+------------+----------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+
6 rows in set (0.00 sec)
root@localhost : tss Sun Sep 26 15:45:03 2010>explain
    -> select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
    ->   where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and start+20<=txstart and stop+20>=txstart
    ->   union
    ->   select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
    ->   where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and start+20>=txstart and stop<=txstart
    ->   union
    ->   select chrom,start,stop,name,name2,txstart as tss,strand,seq_id,probeids,txstart,txend,cdsstart,cdsend from refgene,g20100806_wjh_h_
    ->   where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and start<=txstart and stop>=txstart \G;
*************************** 1. row ***************************
           id: 1
  select_type: PRIMARY
        table: g20100806_wjh_h_
         type: ALL
possible_keys: ix_all,ix_chr
          key: NULL
      key_len: NULL
          ref: NULL
         rows: 385019
        Extra:
*************************** 2. row ***************************
           id: 1
  select_type: PRIMARY
        table: refgene
         type: ref
possible_keys: ix_start,ix_end
          key: ix_start
      key_len: 258
          ref: tss.g20100806_wjh_h_.chromosome,const
         rows: 434
        Extra: Using where
*************************** 3. row ***************************
           id: 2
  select_type: UNION
        table: g20100806_wjh_h_
         type: ALL
possible_keys: ix_all,ix_chr,ix_stop
          key: NULL
      key_len: NULL
          ref: NULL
         rows: 385019
        Extra:
*************************** 4. row ***************************
           id: 2
  select_type: UNION
        table: refgene
         type: ref
possible_keys: ix_start,ix_end
          key: ix_start
      key_len: 258
          ref: tss.g20100806_wjh_h_.chromosome,const
         rows: 434
        Extra: Using where
*************************** 5. row ***************************
           id: 3
  select_type: UNION
        table: g20100806_wjh_h_
         type: ALL
possible_keys: ix_all,ix_chr,ix_start,ix_stop
          key: NULL
      key_len: NULL
          ref: NULL
         rows: 385019
        Extra:
*************************** 6. row ***************************
           id: 3
  select_type: UNION
        table: refgene
         type: ref
possible_keys: ix_start,ix_end
          key: ix_start
      key_len: 258
          ref: tss.g20100806_wjh_h_.chromosome,const
         rows: 434
        Extra: Using where
*************************** 7. row ***************************
           id: NULL
  select_type: UNION RESULT
        table: <union1,2,3>
         type: ALL
possible_keys: NULL
          key: NULL
      key_len: NULL
          ref: NULL
         rows: NULL
        Extra:
7 rows in set (0.00 sec)我按照你的方法建了索引，调整了sql语句，结果如上，有一张表是全文扫描rows: 385019
搞不懂连接表是如何利用索引的？？
因为你的查询条件，对g20100806_wjh_h_来说，只能使用全表扫描，能加速的只是后面的refgene表部分。
呵呵，真是麻烦你了。。
where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and start+20<=txstart and stop+20>=txstart  where refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and start+20>=txstart and stop<=txstartwhere refGene.chrom=g20100806_wjh_h_.chromosome and strand='+' and start<=txstart and stop>=txstart
就这三个条件，将满足这三个条件的结果集合并（union）这是对于strand为‘+’的情况
其实说白了，就是chrom相等，strand为‘+’，start与stop表示一个区间，这个区间只要和[txstart-length,txend]有相交的地方，就认为是我所需要的结果对于strand为‘-’情况也一样。不同的是[start,stop]与区间[txend,txend+length]相交的地方就是我想要的。
您看我这样描述你清楚了么？
chrom相等，strand='+',如果【start,stop】与区间【txstart-length,txend】相交，则该记录就是我们需要的。chrom相等，strand='-',如果【start,stop】与区间【txend,txend+length】相交，则该记录就是我们需要的。
前面的很多where，or，union都是用来判断是否相交用的。
楼主能认真对待一下你自己进出的需求吗？！虽然从CSDN可以得到免费的帮助，但也不是用来浪费的啊。
我先陈述一点，我并没有不认真啊。。(如果你从我的描述中觉得我不认真，那么我只能再次道一声歉)
两张表中的关联字段：
refGene.chrom=g20100806_wjh_h_.chromosome 这是连接的条件
strand='+/-'有两种情况
为‘+’时，将g20100806_wjh_h_表中的每一条记录去和refgene比较，拿如果如果【start,stop】与区间【txstart-length,txend】相交，则该条记录就满足需求，并将其插入到我生成的新表中。
为'-',将g20100806_wjh_h_表中的每一条记录去和refgene比较，如果【start,stop】与区间【txend,txend+length】相交，则该记录就是我们需要的。
(其中start永远小于stop，txstart永远小于txend)
不知道这样你是否清楚？
chrom相等，strand='+',如果【start,stop】与区间【txstart-length,txend】相交，则该记录就是我们需要的。
chrom相等，strand='-',如果【start,stop】与区间【txend,txend+length】相交，则该记录就是我们需要的。请确认红字部分没有错。
如果错了，就结贴重开再问吧。