聚合分析的格式:
"aggregations" : {
"<aggregation_name>" : {
"<aggregation_type>" : {
<aggregation_body>
}
[,"meta" : { [<meta_data_body>] } ]?
[,"aggregations" : { [<sub_aggregation>]+ } ]?
}
[,"<aggregation_name_2>" : { ... } ]*
}
举个栗子-------------------------------
GET test_index/doc/_search
{
"size":0
"aggs": { #聚合关键字
"avg_age": { #聚合的名字
"max": { #聚合分析的类型
"field": "age" #body
}}}}
聚合分析有四种:
metrics,指标分析聚合
bucket,分桶类型
pipeline,管道分析
matrix,矩阵分析
SELECT COUNT(color) FROM table GROUP BY color
# GROUP BY相当于做分桶(bucket)的工作,COUNT是统计指标(metrics)。
- Metrics
单值分析
-
min 返回数值类字段的最小值
GET test_index/_search
{
"size": 0,
"aggs": {
"min_age": {
"min": {
"field": "age"
}
}
}
}
--->
"aggregations": {
"min_age": {
"value": 10
}
} -
max
GET test_index/_search
{
"size": 0,
"aggs": {
"max_age": {
"max": {
"field": "age"
}
}
}
}
--------->
"aggregations": {
"max_age": {
"value": 50
}
} -
avg
GET test_index/_search
{
"size": 0,
"aggs": {
"avg_age": {
"avg": {
"field": "age"
}
}
}
}
------------->
"aggregations": {
"avg_age": {
"value": 24.666666666666668
}
} -
sum 求和
GET test_index/_search
{
"size": 0,
"aggs": {
"sum_age": {
"sum": {
"field": "age"
}
}
}
}
-------------->
"aggregations": {
"sum_age": {
"value": 148
}
} -
cardinality 基数,不同值的个数,类似sql里面的distinct count概念
GET test_index/_search
{
"size": 0,
"aggs": {
"cardinality_age": {
"cardinality": {
"field": "age"
}
}
}
}
----------->
"aggregations": {
"cardinality_age": {
"value": 5
}
} -
value_count 值计数
GET test_index/_search
{
"size": 0,
"aggs": {
"count_age": {
"value_count": {
"field": "name" #值计数
}
}
}
} -
一次可以多个聚合:
GET syslog-2018.07.13/_search
{
"size": 0,
"aggs": {
"min_facility": { #聚合名称
"min": { #聚合类型
"field": "facility"
}
},
"max_factility":{ #聚合名称
"max":{ #聚合类型
"field": "facility"
}
},
"avg_facility":{ #聚合名称
"avg": { #聚合类型
"field": "facility"
}
},
"sum_facility":{
"sum": {
"field": "facility"
}}}}
多值分析
-
stats 返回一系列数值类型的统计值。min,max,sum,count,avg
GET test_index/_search
{
"size": 0,
"aggs": {
"stats_age": {
"stats": {
"field": "age"
}}}}
#结果------------>
"aggregations": {
"stats_age": {
"count": 6,
"min": 10,
"max": 50,
"avg": 24.666666666666668,
"sum": 148
}} -
extended stats 在stats的基础上加了方差标准差等
GET test_index/_search
{
"size": 0,
"aggs": {
"age": {
"extended_stats": {
"field": "age"
}}}}
----------->
"aggregations": {
"age": {
"count": 6,
"min": 10,
"max": 50,
"avg": 24.666666666666668,
"sum": 148,
"sum_of_squares": 4560,
"variance": 151.55555555555557,
"std_deviation": 12.310790208412925,
"std_deviation_bounds": {
"upper": 49.28824708349252,
"lower": 0.04508624984081777
}}} -
percentiles 百分位数统计,了解数据分布情况
GET test_index/_search
{
"size": 0,
"aggs": {
"age": {
"percentiles": {
"field": "age",
"percents": [1,5,25,50,75,95,99] # 不加percents默认为[1,5,25,50,75,95,99]
}}}}
--------->
"aggregations": {
"age": {
"values": {
"1.0": 10,
"5.0": 10,
"25.0": 18,
"50.0": 23,
"75.0": 24,
"95.0": 50,
"99.0": 50
}}}计算第p百分位数的步骤:
第1步:以递增顺序排列原始数据(即从小到大排列)。
第2步:计算指数i=np% (n等于count,p%等于百分位)
第3步:
l)若 i 不是整数,将 i 向上取整。
2) 若i是整数,则第p百分位数是第i项与第(i+l)项数据的平均值。
-
percentiles ranks 字段的数值所占的百分位是多少
GET test_index/_search
{
"size": 0,
"aggs": {
"agg": {
"percentile_ranks": {
"field": "age",
"values": [24,50]
}}}}
----------->
"aggregations": {
"agg": {
"values": {
"24.0": 66.66666666666666,
"50.0": 100
}}} -
top hits 用于分桶后获取桶内最匹配的顶部文档
# 选项:from,size,sort
# 按照host分组,分组后取出每组里面时间最近的一条数据
GET syslog-2018.07.12/_search
{
"size": 0,
"aggs": {
"host": {
"terms": {
"field": "host"
},
"aggs": {
"sort_date": {
"top_hits": {
"size":1,
"sort": [
{
"@timestamp": {
"order":"desc"
}
}
],
"_source": {
"includes": ["host","message"]
}}}}}}}}
- Bucket 按照一定规则将文档分配到不同的桶里,分类分析
-
terms 每个唯一值一个桶,返回字段的值和值的个数doc_count。 如果是text类型,则按照分词后的结果分桶
GET test_index/_search
{
"size": 0,
"aggs": {
"agg": {
"terms": {
"field": "age",
"size": 5 #默认情况,返回按排序的前十个。可用size来更改。
}}}}
--------->
"buckets": [
{
"key": 24,
"doc_count": 2
},
{
"key": 10,
"doc_count": 1
},
{
"key": 18,
"doc_count": 1
},
{
"key": 22,
"doc_count": 1
},
{
"key": 50,
"doc_count": 1
}
] -
range 指定数值范围来设定分桶
GET test_index/_search
{
"size": 0,
"aggs": {
"agg": {
"range": {
"field": "age",
"ranges": [
{
"from": 20,
"to": 30
}]}}}}
-------------------->
"aggregations": {
"agg": {
"buckets": [
{
"key": "20.0-30.0",
"from": 20,
"to": 30,
"doc_count": 3
}]}} #示例2: 可以设定这里的key
GET syslog-2018.07.13/_search
{
"size": 0,
"aggs": {
"priority_range": {
"range": {
"field": "priority",
"ranges": [
{
"key": "<50",
"to": 50
},
{ "from": 50,
"to": 80
},
{
"key": ">80",
"from": 80
}]}}}}
------------------->
"aggregations": {
"priority_range": {
"buckets": [
{
"key": "<50",
"to": 50,
"doc_count": 1990
},
{
"key": "50.0-80.0",
"from": 50,
"to": 80,
"doc_count": 31674
},
{
"key": ">80",
"from": 80,
"doc_count": 5828
}
]
} -
date_range
#跟range的区别是 date range可以设置date match expression,+1h,-1d等。还可以指定返回字段的日期格式 format
GET syslog-2018.07*/_search
{
"size": 0,
"aggs": {
"timestamp_range":{
"date_range": {
"field": "@timestamp",
"format": "yyyy/MM/dd", #可以设置日期格式
"ranges": [
{
"from": "now-10d/d", #可以使用date match
"to": "now-5d/d"
},
{
"from": "now-5d/d"
}
]}}}}
------------------>
"aggregations": {
"timestamp_range": {
"buckets": [
{
"key": "2018/07/03-2018/07/08",
"from": 1530576000000,
"from_as_string": "2018/07/03",
"to": 1531008000000,
"to_as_string": "2018/07/08",
"doc_count": 739175
},
{
"key": "2018/07/08-*",
"from": 1531008000000,
"from_as_string": "2018/07/08",
"doc_count": 760635
}
]
}
} -
histogram 直方图
GET test_index/_search
{
"size": 0,
"aggs": {
"age": {
"histogram": { #关键字
"field": "age",
"interval": 10 #指定间隔大小
"extended_bounds": #指定数据范围
{
"min": 0,
"max": 50
}}}}}
#结果--------------->
"buckets": [
{
"key": 10,
"doc_count": 2
},
{
"key": 20,
"doc_count": 3
},
{
"key": 30,
"doc_count": 0
},
{
"key": 40,
"doc_count": 0
},
{
"key": 50,
"doc_count": 1
}] -
date_histogram 日期直方图
GET syslog-2018.07.1*/_search
{
"size": 0,
"aggs": {
"range": {
"date_histogram": {
"field": "@timestamp",
"format": "yyyy/MM/dd", #设置返回日期格式
"interval": "day" # 以年月日小时分钟为间隔
}}}}
--------------->
"aggregations": {
"range": {
"buckets": [
{
"key_as_string": "2018/07/10",
"key": 1531180800000,
"doc_count": 146354
},
{
"key_as_string": "2018/07/11",
"key": 1531267200000,
"doc_count": 143784
},
{
"key_as_string": "2018/07/12",
"key": 1531353600000,
"doc_count": 143137
},
{
"key_as_string": "2018/07/13",
"key": 1531440000000,
"doc_count": 43206
}
]
} -
filter 给聚合加过滤条件
GET test_index/_search
{
"size": 0,
"aggs": {
"salary": {
"filter": { #先过滤
"range": {
"salary": {
"gte": 8000
}
}
},
"aggs": {
"avg_age": { #后聚合
"avg": {
"field": "age"
}}}}}}
--------------------->
"aggregations": {
"salary": {
"doc_count": 4,
"avg_age": {
"value": 23.25
}
}
} - filters
GET /logs/_search
{
"aggs": {
"count_debug":{ #agg name
"filters": { #关键字
"filters": {
"error": { # 过滤器名字
"match":{ #查询语句关键字match
"body":"error" #匹配body字段中带有error的
}
},
"warnings":{ #过滤器名字
"term":{ #查询语句关键字term
"body":"warning" #匹配body字段中带有warning的
}}}}}}}
---------结果--------》
"buckets": {
"error": {
"doc_count": 1
},
"warnings": {
"doc_count": 2
}}-
nested 嵌套类型聚合
PUT test_index
{
"mappings": {
"doc": {
"properties": {
"man":{
"type": "nested", #设置man字段为nested类型
"properties": { #子字段
"age":{
"type":"integer"
},
"name":{
"type":"text"
}}}}}}}}
PUT test_index/doc/1
{
"man":[
{
"name":"alice white",
"age":34
},
{
"name":"peter brown",
"age":26
}
]
}
GET test_index/_search
{
"size": 0,
"aggs": { #聚合关键字
"man": { #聚合名字
"nested": { #关键字
"path": "man" #嵌套字段
},
"aggs": {
"avg_age": {
"avg": {
"field": "man.age" #子字段
}
}
}
}
}
}
嵌套聚合
- bucket+bucket
GET bank/_search
{
"size": 0,
"aggs": {
"state": { #名字
"terms": { #关键字
"field": "state.keyword" #按照不同国家分桶
},
"aggs": { #嵌套
"range_age": { #名字
"range": { #关键字
"field": "age",
"ranges": [
{
"from": 20,
"to": 30
}
]
}
}
}
}
}
}
-------------------------->
"aggregations": {
"state": {
"doc_count_error_upper_bound": 20,
"sum_other_doc_count": 770,
"buckets": [
{
"key": "ID",
"doc_count": 27,
"range_age": {
"buckets": [
{
"key": "20.0-30.0",
"from": 20,
"to": 30,
"doc_count": 9
}
]
}
},
{
"key": "TX",
"doc_count": 27,
"range_age": {
"buckets": [
{
"key": "20.0-30.0",
"from": 20,
"to": 30,
"doc_count": 17
}
]
}
},
{
"key": "AL",
"doc_count": 25,
"range_age": {
"buckets": [
{
"key": "20.0-30.0",
"from": 20,
"to": 30,
"doc_count": 12
}
]
}
},................
- bucket+metrics
GET bank/_search
{
"size": 0,
"aggs": {
"state": { #桶名字
"terms": { #bucket聚合分析,按国家名分桶
"field": "state.keyword"
},
"aggs": { #嵌套
"avg_age": { #桶名字
"avg": { # metric聚合分析,求不同桶的age平均值
"field": "age"
}}}}}}
#结果------------->
"aggregations": {
"state": {
"doc_count_error_upper_bound": 20,
"sum_other_doc_count": 770,
"buckets": [
{
"key": "ID",
"doc_count": 27,
"avg_age": {
"value": 31.59259259259259
}
},
{
"key": "TX",
"doc_count": 27,
"avg_age": {
"value": 28.77777777777778
}
},
{
"key": "AL",
"doc_count": 25,
"avg_age": {
"value": 29.16
}
},
{
"key": "MD",
"doc_count": 25,
"avg_age": {
"value": 31.04
}
},
{
"key": "TN",
"doc_count": 23,
"avg_age": {
"value": 30.91304347826087
}
},
{
"key": "MA",
"doc_count": 21,
"avg_age": {
"value": 27.761904761904763
}
},
{
"key": "NC",
"doc_count": 21,
"avg_age": {
"value": 31.333333333333332
}
}
聚合分析的作用范围:
-
filter 只为某个聚合分析设定过滤条件,不改变整体过滤条件
# filter过滤条件只作用于host_priority_little聚合,不作用于host聚合
GET syslog-2018.07.13/_search
{
"size": 0,
"aggs": {
"host_priority_little": {
"filter": {
"range": {
"priority": {
"to":50
}
}
},
"aggs": {
"host": {
"terms": {
"field": "host",
"size": 2
}
}
}
},
"host":{
"terms": {
"field": "host",
"size":2
}
}
}
}
------------->
"aggregations": {
"host_priority_little": {
"doc_count": 2530,
"host": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 196,
"buckets": [
{
"key": "10.10.14.16",
"doc_count": 2108
},
{
"key": "10.10.12.171",
"doc_count": 226
}
]
}
},
"host": {
"doc_count_error_upper_bound": 593,
"sum_other_doc_count": 37640,
"buckets": [
{
"key": "10.10.14.248",
"doc_count": 7198
},
{
"key": "10.10.14.4",
"doc_count": 6494
}
]
}
} -
post_filter 过滤文档,但不影响聚合
GET syslog-2018.06.13/_search
{
"size":0,
"aggs": {
"host": {
"terms": {
"field": "host"
}
}
},
"post_filter": {
"range": {
"priority": {
"gte": 100
}}}}
---------------->
"hits": {
"total": 106, #post_filter只作用于命中的文档数,跟聚合无关
"max_score": 0,
"hits": []
},
"aggregations": { #聚合不管post_filter的过滤条件
"host": {
"doc_count_error_upper_bound": 430,
"sum_other_doc_count": 53134,
"buckets": [
{
"key": "10.10.14.248",
"doc_count": 19590
},
{
"key": "172.16.10.37",
"doc_count": 17625
},
...........
# 如果使用的query,则是命中文档数,并作用于聚合分析
GET syslog-2018.06.13/_search
{
"size":0,
"aggs": {
"host": {
"terms": {
"field": "host"
}
}
},
"query": {
"range": {
"priority": {
"gte": 100
}}}}
-------------------->
"hits": {
"total": 106, #根据query条件 命中的文档数
"max_score": 0,
"hits": []
},
"aggregations": { # 对query查询后的文档进行聚合
"host": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "172.16.10.253",
"doc_count": 106
}
]
}
} -
global 无视query过滤条件,基于全部文档分析
GET test_index/_search
{
"size": 0,
"query": {
"match": {
"name": "lin"
}
},
"aggs": {
"lin_age_avg": {
"avg": {
"field": "age"
}
},
"all":{
"global": {},
"aggs": {
"avg_age": {
"avg": {
"field": "age"
}}}}}}
---------------------------->
"aggregations": {
"all": {
"doc_count": 4,
"avg_age": {
"value": 25.25
}
},
"lin_age_avg": {
"value": 24.5
}
}
}
聚合分析的排序:
- 使用自带的关键数据进行排序
_count 按照文档数doc_count排序 。如果不指定order则默认按_count倒排
_key 按照key值排序
GET test_index/_search
{
"size": 0,
"aggs": {
"age": {
"terms": {
"field": "salary"
}}}}
-#-----------------> 不指定order时默认情况下,使用_count倒排
"aggregations": {
"age": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 8000,
"doc_count": 3
},
{
"key": 5000,
"doc_count": 2
},
{
"key": 4000,
"doc_count": 1
},
{
"key": 9000,
"doc_count": 1
}
]
}
}
# 使用排序的_term类型 ,按照key进行排序
GET test_index/_search
{
"size": 0,
"aggs": {
"age": {
"terms": {
"field": "salary",
"order": { # order关键词
"_key": "asc" #按照key升序排序
}}}}}}
------------------>
"aggregations": {
"age": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": 4000,
"doc_count": 1
},
{
"key": 5000,
"doc_count": 2
},
{
"key": 8000,
"doc_count": 3
},
{
"key": 9000,
"doc_count": 1
}
]
}
-
可以使用子聚合的结果进行排序
GET test_index/_search
{
"size": 0,
"aggs": {
"salary": {
"terms": {
"field": "salary",
"order": {
"avg_age": "asc" # 按照子聚合里面的avg_age升序排序
}
},
"aggs": { #子聚合
"avg_age": {
"avg": {
"field": "age"
}}}}}}}
----------------->
"buckets": [
{
"key": 9000,
"doc_count": 1,
"avg_age": {
"value": 21
}
},
{
"key": 4000,
"doc_count": 1,
"avg_age": {
"value": 22
}
},
{
"key": 8000,
"doc_count": 3,
"avg_age": {
"value": 24
}
},
{
"key": 5000,
"doc_count": 2,
"avg_age": {
"value": 25.5
}
}
] # 在有多值指标的情况下,需要修改一下
GET test_index/_search
{
"size": 0,
"aggs": {
"salary": {
"terms": {
"field": "salary",
"order": {
"stats_age.avg": "asc" # 按照 stats中的avg 排序,使用.
}
},
"aggs": {
"stats_age": {
"stats": {
"field": "age"
}}}}}}
------------->
"buckets": [
{
"key": 9000,
"doc_count": 1,
"stats_age": {
"count": 1,
"min": 21,
"max": 21,
"avg": 21,
"sum": 21
}
},
{
"key": 4000,
"doc_count": 1,
"stats_age": {
"count": 1,
"min": 22,
"max": 22,
"avg": 22,
"sum": 22
}
},
.....................
]
#例2:
GET test_index/_search
{
"size": 0,
"aggs": {
"salary": {
"terms": {
"field": "salary",
"order": {
"filter_age>stats_age.sum": "asc" #注意这里的写法
}
},
"aggs": { #子聚合
"filter_age":{
"filter": { #过滤age>21的
"range": {
"age": {
"gt": 21
}
}
},
"aggs": {
"stats_age": {
"stats": { #多值分析
"field": "age"
}}}}}}}}
------------------------->
"buckets": [
{
"key": 9000,
"doc_count": 1,
"filter_age": {
"doc_count": 0,
"stats_age": {
"count": 0,
"min": null,
"max": null,
"avg": null,
"sum": null
}
}
},
{
"key": 4000,
"doc_count": 1,
"filter_age": {
"doc_count": 1,
"stats_age": {
"count": 1,
"min": 22,
"max": 22,
"avg": 22,
"sum": 22
}
}
},
{
"key": 5000,
"doc_count": 2,
"filter_age": {
"doc_count": 2,
"stats_age": {
"count": 2,
"min": 23,
"max": 28,
"avg": 25.5,
"sum": 51
}
}
},
{
"key": 8000,
"doc_count": 3,
"filter_age": {
"doc_count": 3,
"stats_age": {
"count": 3,
"min": 22,
"max": 26,
"avg": 24,
"sum": 72
}
}
}
]
- Pipeline 根据输出位置的不同 分为两类:
-parent 结果内嵌到现有的聚合分析结果中
- derivative 求导 计算父级histgram(date_histgram)中指定指标的导数
- cumulati average 累计总和 计算父histgram(date_histgram)中指定指标的累计总和。
- moving average 移动平均值 聚合将动态移动数据窗口,生成该窗口数据的平均值。
-sibling 结果与现有聚合结果同级
-
max_bucket /min_bucket / avg_bucket / sum_bucket
{
"max_bucket": {
"buckets_path": "the_sum"
}
}GET bank/_search
{
"size": 0,
"aggs": {
"state": { #聚合名字
"terms": { # 按照不同国家分桶
"field": "state.keyword"
},
"aggs": {
"avg_age": { # 聚合名字
"avg": { # 各个桶的age的平均值
"field": "age"
}
}
}
},
"max_state_age":{ #pipeline聚合名字, 跟state聚合同级
"max_bucket": { #关键字
"buckets_path": "state>avg_age" # 各个国家平均值中的最大值 >state表示包含在state里的avg_age
}
}
}
}
------------------------>
"aggregations": {
"state": {
"doc_count_error_upper_bound": 20,
"sum_other_doc_count": 770,
"buckets": [
{
"key": "MA",
"doc_count": 21,
"avg_age": {
"value": 27.761904761904763
}
},
{
"key": "NC",
"doc_count": 21,
"avg_age": {
"value": 31.333333333333332
}
},
{
"key": "ND",
"doc_count": 21,
"avg_age": {
"value": 31.238095238095237
}}]
},
"max_state_age": {
"value": 31.59259259259259,
"keys": [
"ID"
]}} sql语句:
SELECT COUNT(DISTINCT mac,ip) FROM test
对应的es语句:
GET nginx-access-log-2018.07.24/_search
{
"size": 0,
"aggs": {
"beat": {
"terms": { #先对beat分桶
"field": "beat.name"
},
"aggs": {
"ip": { #桶内ip去重
"cardinality": {
"field": "clientip"
}
}
}
},
"sum_beat_ip":{ #不同桶的ip总数
"sum_bucket": {
"buckets_path": "beat>ip"
}
}
}
}-
stats_bucket / extended_stats_bucket
GET bank/_search
{
"size": 0,
"aggs": {
"state": {
"terms": {
"field": "state.keyword"
},
"aggs": {
"min_age": {
"min": {
"field": "age"
}
}
}
},
"stats_state_age":{
"stats_bucket": {
"buckets_path": "state>min_age"
}
}
}
}
--------------->
"aggregations": {
"state": {
"doc_count_error_upper_bound": 20,
"sum_other_doc_count": 770,
"buckets": [
{
"key": "ID",
"doc_count": 27,
"min_age": {
"value": 21
}
},
{
"key": "MD",
"doc_count": 25,
"min_age": {
"value": 20
}
},
{
"key": "ND",
"doc_count": 21,
"min_age": {
"value": 21
}
},
{
"key": "ME",
"doc_count": 20,
"min_age": {
"value": 21
}
}
]
},
"stats_state_age": {
"count": 10,
"min": 20,
"max": 22,
"avg": 20.6,
"sum": 206
}
}
}{
"stats_bucket": {
"buckets_path": "the_sum"
}
} -
percentiles
GET bank/_search
{
"size": 0,
"aggs": {
"state": {
"terms": {
"field": "state.keyword"
},
"aggs": {
"avg_age": {
"avg": {
"field": "age"
}
}
}
},
"percen_state_age":{
"percentiles_bucket":{
"buckets_path":"state>avg_age"
}
}
}
}
-------->
"aggregations": {
"state": {
"doc_count_error_upper_bound": 20,
"sum_other_doc_count": 770,
"buckets": [
{
"key": "ID",
"doc_count": 27,
"avg_age": {
"value": 31.59259259259259
}
},
{
"key": "TX",
"doc_count": 27,
"avg_age": {
"value": 28.77777777777778
}
},..................
]
},
"percen_state_age": {
"values": {
"1.0": 27.761904761904763,
"5.0": 27.761904761904763,
"25.0": 28.77777777777778,
"50.0": 30.91304347826087,
"75.0": 31.238095238095237,
"95.0": 31.59259259259259,
"99.0": 31.59259259259259
}
}
}
- Matrix