【11】把 Elasticsearch 当数据库使:Filter 下钻

1291 查看

使用 https://github.com/taowen/es-monitor 可以用 SQL 进行 elasticsearch 的查询。下钻未必一定是 GROUP BY:每经过一层 GROUP BY,桶的数量就会增加一层。还有一种下钻是用 filter 来下钻,这种下钻方式会使下一层 match 的文档数量变少,但是桶的数量不变。

SELECT INSIDE WHERE ipo_year=2000

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
    WITH all_symbols AS (SELECT MAX(market_cap) AS max_all_times FROM symbol);
    WITH year_2000 AS (SELECT MAX(market_cap) AS max_at_2000 FROM all_symbols 
        WHERE ipo_year=2000)
EOF
{"max_at_2000": 20310000000.0, "max_all_times": 522690000000.0}

可以看到桶的数量没有变化(仍然只有一个),只是增加了一个 max_at_2000 结果字段。
Elasticsearch

{
  "aggs": {
    "year_2000": {
      "filter": {
        "term": {
          "ipo_year": 2000
        }
      }, 
      "aggs": {
        "max_at_2000": {
          "max": {
            "field": "market_cap"
          }
        }
      }
    }, 
    "max_all_times": {
      "max": {
        "field": "market_cap"
      }
    }
  }, 
  "size": 0
}
{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 3, 
  "aggregations": {
    "year_2000": {
      "max_at_2000": {
        "value": 20310000000.0
      }, 
      "doc_count": 58
    }, 
    "max_all_times": {
      "value": 522690000000.0
    }
  }, 
  "timed_out": false
}

Profile

[
  {
    "query": [
      {
        "query_type": "TermQuery",
        "lucene": "ipo_year:`P",
        "time": "0.3492430000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 220149,
          "next_doc": 0,
          "match": 0,
          "build_scorer": 95037,
          "advance": 34057
        }
      },
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "0.1883710000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 3980,
          "next_doc": 169730,
          "match": 0,
          "build_scorer": 14661,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 3158,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "8.789594000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.2768050000ms"
          },
          {
            "name": "BucketCollector: [[year_2000, max_all_times]]",
            "reason": "aggregation",
            "time": "7.667765000ms"
          }
        ]
      }
    ]
  }
]

从Profile的结果来看,其实是一次性做了两个查询,一个是全部文档,一个是ipo_year=2000的文档,然后统一聚合。

SELECT INSIDE 往不同方向过滤下钻

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
    WITH all_symbols AS (SELECT MAX(market_cap) AS max_all_times FROM symbol);
    WITH year_2000 AS (SELECT MAX(market_cap) AS max_at_2000 FROM all_symbols 
        WHERE ipo_year=2000);
    WITH year_2001 AS (SELECT MAX(market_cap) AS max_at_2001 FROM all_symbols 
        WHERE ipo_year=2001)
EOF

这个写法其实和 CASE WHEN 很类似,但是其表达能力更强大,更灵活。

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
    SELECT per_ipo_year, MAX(market_cap) AS max_all_times FROM symbol 
        GROUP BY CASE 
            WHEN ipo_year=2000 THEN 'year_2000' 
            WHEN ipo_year=2001 THEN 'year_2001' 
        END AS per_ipo_year
EOF

前面使用 WITH 写法(year_2000 / year_2001 两个方向过滤下钻)的 SQL,其查询结果是

{"max_at_2000": 20310000000.0, "max_all_times": 522690000000.0, "max_at_2001": 8762940000.0}

Elasticsearch

{
  "aggs": {
    "year_2001": {
      "filter": {
        "term": {
          "ipo_year": 2001
        }
      }, 
      "aggs": {
        "max_at_2001": {
          "max": {
            "field": "market_cap"
          }
        }
      }
    }, 
    "year_2000": {
      "filter": {
        "term": {
          "ipo_year": 2000
        }
      }, 
      "aggs": {
        "max_at_2000": {
          "max": {
            "field": "market_cap"
          }
        }
      }
    }, 
    "max_all_times": {
      "max": {
        "field": "market_cap"
      }
    }
  }, 
  "size": 0
}
{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 2, 
  "aggregations": {
    "year_2001": {
      "max_at_2001": {
        "value": 8762940000.0
      }, 
      "doc_count": 38
    }, 
    "year_2000": {
      "max_at_2000": {
        "value": 20310000000.0
      }, 
      "doc_count": 58
    }, 
    "max_all_times": {
      "value": 522690000000.0
    }
  }, 
  "timed_out": false
}

Profile

 [
  {
    "query": [
      {
        "query_type": "TermQuery",
        "lucene": "ipo_year:`Q",
        "time": "0.2518270000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 186032,
          "next_doc": 0,
          "match": 0,
          "build_scorer": 48664,
          "advance": 17131
        }
      },
      {
        "query_type": "TermQuery",
        "lucene": "ipo_year:`P",
        "time": "0.1200760000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 77254,
          "next_doc": 0,
          "match": 0,
          "build_scorer": 25184,
          "advance": 17638
        }
      },
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "0.1968800000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 3573,
          "next_doc": 180136,
          "match": 0,
          "build_scorer": 13171,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 4250,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "2.459413000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.2160950000ms"
          },
          {
            "name": "BucketCollector: [[year_2001, year_2000, max_all_times]]",
            "reason": "aggregation",
            "time": "1.455703000ms"
          }
        ]
      }
    ]
  }
]

SELECT INSIDE 往同一方向连续过滤下钻

SQL

$ cat << EOF | ./es_query.py http://127.0.0.1:9200
    WITH SELECT MAX(market_cap) AS max_all_times FROM symbol AS all_symbols;
    WITH SELECT MAX(market_cap) AS max_at_2000 FROM all_symbols 
        WHERE ipo_year=2000 AS year_2000;
    WITH SELECT MAX(market_cap) AS max_at_2001_finance FROM year_2000 
        WHERE sector='Finance' AS year_2000_finance
EOF
{"max_at_2000": 20310000000.0, "max_all_times": 522690000000.0, "max_at_2001_finance": 985668354.0}

Elasticsearch

{
  "aggs": {
    "year_2000": {
      "filter": {
        "term": {
          "ipo_year": 2000
        }
      }, 
      "aggs": {
        "max_at_2000": {
          "max": {
            "field": "market_cap"
          }
        }, 
        "year_2000_finance": {
          "filter": {
            "term": {
              "sector": "Finance"
            }
          }, 
          "aggs": {
            "max_at_2001_finance": {
              "max": {
                "field": "market_cap"
              }
            }
          }
        }
      }
    }, 
    "max_all_times": {
      "max": {
        "field": "market_cap"
      }
    }
  }, 
  "size": 0
}
{
  "hits": {
    "hits": [], 
    "total": 6714, 
    "max_score": 0.0
  }, 
  "_shards": {
    "successful": 1, 
    "failed": 0, 
    "total": 1
  }, 
  "took": 2, 
  "aggregations": {
    "year_2000": {
      "max_at_2000": {
        "value": 20310000000.0
      }, 
      "year_2000_finance": {
        "max_at_2001_finance": {
          "value": 985668354.0
        }, 
        "doc_count": 2
      }, 
      "doc_count": 58
    }, 
    "max_all_times": {
      "value": 522690000000.0
    }
  }, 
  "timed_out": false
}

Profile

[
  {
    "query": [
      {
        "query_type": "TermQuery",
        "lucene": "ipo_year:`P",
        "time": "0.1897790000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 145762,
          "next_doc": 0,
          "match": 0,
          "build_scorer": 26216,
          "advance": 17801
        }
      },
      {
        "query_type": "TermQuery",
        "lucene": "sector:Finance",
        "time": "0.2380290000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 57770,
          "next_doc": 0,
          "match": 0,
          "build_scorer": 55497,
          "advance": 124762
        }
      },
      {
        "query_type": "MatchAllDocsQuery",
        "lucene": "*:*",
        "time": "0.1965630000ms",
        "breakdown": {
          "score": 0,
          "create_weight": 3500,
          "next_doc": 178347,
          "match": 0,
          "build_scorer": 14716,
          "advance": 0
        }
      }
    ],
    "rewrite_time": 4190,
    "collector": [
      {
        "name": "MultiCollector",
        "reason": "search_multi",
        "time": "2.466917000ms",
        "children": [
          {
            "name": "TotalHitCountCollector",
            "reason": "search_count",
            "time": "0.2712430000ms"
          },
          {
            "name": "BucketCollector: [[year_2000, max_all_times]]",
            "reason": "aggregation",
            "time": "1.370663000ms"
          }
        ]
      }
    ]
  }
]

有了 GROUP BY 下钻和 FILTER 下钻,很多复杂的查询用一条语句就可以查询出来。而这种边下钻边聚合指标的查询能力甚至是传统 SQL 都不具备的。而且稍微训练一下,就会非常习惯这种下钻的思维方式,写查询也会很自然。