# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import argparse
import json
import os
import sys
import time
import urllib.request
from urllib.error import URLError

import yaml

import sfconfig.utils
#############################
# InfluxDB query management #
#############################
CONTINUOUS_QUERY = {
'zuul.all_jobs': {
'measurement': '/zuul.tenant.*.pipeline.*.all_jobs/',
},
}
NODE_STATUS = ("building", "ready", "in-use", "used",
"deleting", "hold", "failed")
for node_status in NODE_STATUS:
    CONTINUOUS_QUERY['nodepool.all_providers.%s' % node_status] = {
'measurement': '/nodepool.provider.*.nodes.%s/' % node_status}
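# For illustration, update_cq_action below expands each CONTINUOUS_QUERY
# entry into an InfluxQL statement of the following form (with the default
# 30s resample time and sum() function):
#   CREATE CONTINUOUS QUERY "qc_zuul.all_jobs" ON telegraf
#   RESAMPLE FOR 30s
#   BEGIN SELECT sum(value) AS value INTO "zuul.all_jobs"
#   FROM /zuul.tenant.*.pipeline.*.all_jobs/
#   GROUP BY time(30s) fill(0) END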
def update_cq_action(args):
    '''Update InfluxDB continuous queries; return 4 on change'''
if args.influxdb_host is None or args.influxdb_password is None:
sfconfig.utils.fail("Host and password is required")
influx_cmd = ['influx', '-ssl', '-host', args.influxdb_host,
'-port', '8086', '-username', 'admin', '-password',
args.influxdb_password, '-database', 'telegraf', '-execute']
if sfconfig.utils.pread(["podman", "container", "list", "-a", "--format",
"'{{ .Names}}'", "--filter", "name=^influxdb$"]):
influx_cmd = ["podman", "exec", "-t", "influxdb"] + influx_cmd
queries = sfconfig.utils.pread(
influx_cmd + ["SHOW CONTINUOUS QUERIES"]).split('\n')
changed = False
    for cq, cq_data in CONTINUOUS_QUERY.items():
cmd = 'CREATE CONTINUOUS QUERY "qc_%s" ON telegraf' % cq
cmd += ' RESAMPLE FOR %s' % cq_data.get('time', '30s')
cmd += ' BEGIN SELECT %s(value) AS value' % cq_data.get('function',
'sum')
cmd += ' INTO "%s"' % cq
cmd += ' FROM %s' % cq_data['measurement']
cmd += ' GROUP BY time(%s)' % cq_data.get('time', '30s')
cmd += ' fill(%s)' % cq_data.get('resample_fill', '0')
cmd += ' END'
to_create = True
for query in queries:
if query.startswith("qc_%s " % cq):
# Query name already exists, check if it needs update
query = query.split(None, 1)[1]
# Fix query creation command to match show command output
qmd = cmd.replace("INTO ", "INTO telegraf.autogen.")
qmd = qmd.replace("FROM ", "FROM telegraf.autogen.")
if query == qmd:
to_create = False
else:
                    # ALTER doesn't exist; we need to drop the previous one
print("Removing old %s" % cq)
sfconfig.utils.execute(influx_cmd + [
'DROP CONTINUOUS QUERY "qc_%s" ON "telegraf"' % cq])
break
if to_create:
changed = True
print("Adding %s" % cq)
sfconfig.utils.execute(influx_cmd + [cmd])
return 4 if changed else 0
#######################
# Grafyaml generation #
#######################
def dashboard(name, description):
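    '''Return a grafyaml dashboard skeleton with a single description row'''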
desc = "**%s**\n\n%s" % (
description,
'This dashboard is automatically generated by sfconfig.'
' If you would like to make changes, check the graph_render module'
)
return {
'title': name,
'rows': [{
'title': 'Description',
'height': '100px',
'panels': [{
'title': 'Description',
'type': 'text',
'content': desc,
}]
}
]}
def row(title, height='150px', show_title=True):
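    '''Return an empty grafyaml row to be filled with panels'''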
return {
'title': title.capitalize(),
'height': height,
'showTitle': show_title,
'panels': []}
def single_stat(title, measurement, span=2, tags=True):
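    '''Return a singlestat panel showing the current value of a measurement'''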
stat = {
'title': title.capitalize(),
'span': span,
'sparkline': {'full': True, 'show': True},
'targets': [{
'groupBy': [{'type': 'time', 'params': ['$__interval']},
{'type': 'fill', 'params': ['0']}],
'measurement': measurement,
'orderByTime': 'ASC',
'resultFormat': 'time_series',
'select': [[{'type': 'field', 'params': ['value']},
{'type': 'distinct', 'params': []}]],
}],
'type': 'singlestat',
'valueName': 'current',
}
if tags:
for target in stat['targets']:
target['tags'] = [{'key': 'metric_type', 'operator': '=',
'value': 'gauge'}]
return stat
def graph(title, metrics, span=4, interval='$__interval', scale=None,
select_fct='count', field='value', stack=False, tooltip=None,
seriesOverrides=None, legend=None, yaxes_format=None):
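    '''Return a graph panel with one InfluxDB target per metric.

    Each metric is a dict holding the measurement under 'm' and optional
    'alias' and 'type' (the metric_type tag value, defaulting to 'gauge').
    '''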
targets = []
for metric in metrics:
if not metric.get('alias'):
metric['alias'] = metric['m'].split('.')[-1].replace(
'/', '').replace('_', ' ').capitalize()
select = [
{'type': 'field', 'params': [field]},
{'type': select_fct, 'params': []},
]
if scale:
select.append({'type': 'math', 'params': [scale]})
targets.append({
'groupBy': [{'type': 'time', 'params': [interval]},
{'type': 'fill', 'params': ['0']}],
'measurement': metric['m'],
'alias': metric['alias'],
'orderByTime': 'ASC',
'resultFormat': 'time_series',
'select': [select],
'tags': [{'key': 'metric_type',
'operator': '=',
'value': metric.get('type', 'gauge')}]
})
g = {
'title': title.capitalize(),
'span': span,
'targets': targets,
'type': 'graph',
'yaxes': [{"min": 0}, {"min": 0}]
}
if legend:
g['legend'] = legend
if yaxes_format:
g['yaxes'] = [yaxes_format, yaxes_format]
if stack:
g['stack'] = True
if seriesOverrides:
g['seriesOverrides'] = [{'alias': seriesOverrides, 'stack': False}]
if tooltip:
g['tooltip'] = {'value_type': tooltip}
return g
def zuul_dashboard(args):
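    '''Build the Zuul Status dashboard: per-tenant pipeline stats plus
    global, executor and merger rows'''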
d = dashboard("Zuul Status", "The Zuul service metrics")
for tenant in args.tenants:
tenant_row = row("%s's pipelines" % tenant['name'].capitalize())
for pipeline in tenant['pipelines']:
span = 3 if pipeline in ("check", "gate") else 2
tenant_row['panels'].append(single_stat(
pipeline,
'zuul.tenant.%s.pipeline.%s.current_changes' % (
tenant['name'], pipeline),
span))
d['rows'].append(tenant_row)
global_row = row("Global stats", height="250px")
global_row["panels"].append(graph(
'Jobs Launched (per Hour)',
[{'m': '/zuul.tenant.*.pipeline.*.all_jobs/', 'alias': '$5',
'type': 'counter'}], select_fct='distinct', interval='1h'))
global_row["panels"].append(graph(
'Node Requests',
[{'m': 'zuul.nodepool.current_requests', 'alias': 'Requests'}],
select_fct='distinct'))
global_row["panels"].append(graph(
'Job Queue',
[{'m': 'zuul.geard.queue.running'},
{'m': 'zuul.geard.queue.waiting'},
{'m': 'zuul.geard.queue.total'}],
select_fct='distinct'))
global_row["panels"].append(graph(
'Gerrit Events (per Hour)',
[{'m': 'zuul.event.gerrit.comment-added', 'type': 'counter'},
{'m': 'zuul.event.gerrit.patchset-created', 'type': 'counter'},
{'m': 'zuul.event.gerrit.change-merged', 'type': 'counter'}],
interval='1h', select_fct='distinct'))
global_row["panels"].append(graph(
'Test Nodes',
[{'m': 'nodepool.nodes.building'},
{'m': 'nodepool.nodes.ready'},
{'m': 'nodepool.nodes.in-use'},
{'m': 'nodepool.nodes.used'},
{'m': 'nodepool.nodes.deleting'}, ],
select_fct='distinct',
stack=True,
seriesOverrides='Max',
tooltip='individual'))
d["rows"].append(global_row)
executors_row = row("Executors", height="250px")
executors_row["panels"].append(graph(
"Executors",
[{"m": "zuul.executors.online"}, {"m": "zuul.executors.accepting"}],
select_fct='distinct',
span=6))
executors_row["panels"].append(graph(
"Executors Queue",
[{"m": "zuul.executors.jobs_queued"},
{"m": "zuul.executors.jobs_running"}],
select_fct='distinct',
span=6))
executors_row["panels"].append(graph(
"Starting Builds",
[{"m": "/zuul.executor.*.starting_builds/", 'alias': '$2'}],
select_fct='distinct',
span=6))
executors_row["panels"].append(graph(
"Running Builds",
[{"m": "/zuul.executor.*.running_builds/", 'alias': '$2'}],
select_fct='distinct',
span=6))
executors_row["panels"].append(graph(
"Load Average",
[{"m": "/zuul.executor.*.load_average/", 'alias': '$2'}],
scale="/ 100",
select_fct='distinct',
span=6))
executors_row["panels"].append(graph(
"Used HDD",
[{"m": "/zuul.executor.*.pct_used_hdd/", 'alias': '$2'}],
scale="/ 100",
select_fct='distinct',
span=6))
executors_row["panels"].append(graph(
"Used RAM",
[{"m": "/zuul.executor.*.pct_used_ram/", 'alias': '$2'}],
scale="/ 100",
select_fct='distinct',
span=6))
d["rows"].append(executors_row)
mergers_row = row("Mergers", height="250px")
mergers_row["panels"].append(graph(
"Mergers", [{"m": "zuul.mergers.online"}], span=6,
select_fct='distinct'))
mergers_row["panels"].append(graph(
"Merger Queue", [
{"m": "zuul.mergers.jobs_queued"},
{"m": "zuul.mergers.jobs_running"}], span=6,
select_fct='distinct'))
d["rows"].append(mergers_row)
return d
def nodepool_dashboard(args):
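    '''Build the Nodepool Status dashboard: node status counts, per-label
    node graphs and launch statistics'''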
d = dashboard("Nodepool Status", "The Nodepool service metrics")
nodes_row = row("Nodes", height="150px")
for status in NODE_STATUS:
nodes_row["panels"].append(single_stat(
status,
'/nodepool.all_providers.%s/' % status,
tags=False
))
d["rows"].append(nodes_row)
images_row = row("Images", height="320px")
for status in ("building", "ready", "in-use", "deleting"):
images_row["panels"].append(graph(
status,
[{"m": "/nodepool.label.*.nodes.%s/" % status, 'alias': '$2'}],
span=3))
d["rows"].append(images_row)
launch_row = row("Launches", height="250px")
launch_row["panels"].append(graph(
'Ready Node Launch Attempts',
[{'m': '/nodepool.launch.provider.*.ready/', 'alias': '$3',
'type': 'counter'}], yaxes_format={'label': 'events/min'},
interval='1m'))
launch_row["panels"].append(graph(
'Error Node Launch Attempts',
[{'m': '/nodepool.launch.provider.*.error/', 'alias': '$5',
'type': 'counter'}], yaxes_format={'label': 'events/min'},
interval='1m'))
launch_row["panels"].append(graph(
'Time to Ready',
[{'m': '/nodepool.launch.provider.*.ready/', 'alias': '$3',
'type': 'timing'}], select_fct='distinct', field='mean',
yaxes_format={'label': 'time', 'format': 'ms'}))
d["rows"].append(launch_row)
return d
def provider_dashboard(args, provider):
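    '''Build a per-provider dashboard; OpenStack providers also get a row
    of API operation timings'''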
d = dashboard("Provider - %s" % provider["name"],
"The Nodepool provider %s metrics" % provider["name"])
nodes_row = row("Nodes", height="150px")
for status in NODE_STATUS:
nodes_row["panels"].append(single_stat(
status,
'/nodepool.provider.%s.nodes.%s/' % (provider["name"], status)))
d["rows"].append(nodes_row)
if provider["driver"] == "openstack":
api_row = row("API Operations", height="250px")
for name, metric in (("Create Server", "ComputePostServers"),
("Get Server", "ComputeGetServersDetail"),
("Delete Server", "ComputeDeleteServers"),
("List Server", "ComputeGetServers"),
("Get Limits", "ComputeGetLimits")):
api_row["panels"].append(graph(
name, [{"m": "nodepool.task.%s.%s" % (
provider["name"], metric), 'type': 'timing',
'alias': provider["name"]}], select_fct='distinct',
field='mean', yaxes_format={'label': 'time', 'format': 'ms'},
span=4))
d["rows"].append(api_row)
launch_row = row("Node Launches", height="250px")
launch_row["panels"].append(graph(
'Ready Node Launch Attempts',
[{'m': '/nodepool.launch.provider.%s.ready/' % provider["name"],
'alias': '$3', 'type': 'counter'}], interval='1m',
yaxes_format={'label': 'events/min'}))
launch_row["panels"].append(graph(
'Error Node Launch Attempts',
[{'m': '/nodepool.launch.provider.%s.error/' % provider["name"],
'alias': '$5', 'type': 'counter'}], interval='1m',
yaxes_format={'label': 'events/min'}))
launch_row["panels"].append(graph(
'Time to Ready',
[{'m': '/nodepool.launch.provider.%s.ready/' % provider["name"],
'alias': '$3', 'type': 'timing'}], select_fct='distinct',
field='mean', yaxes_format={'label': 'time', 'format': 'ms'}))
launch_row["panels"].append(graph(
'Test Nodes',
[{'m': 'nodepool.provider.%s.nodes.building' % provider["name"]},
{'m': 'nodepool.provider.%s.nodes.ready' % provider["name"]},
{'m': 'nodepool.provider.%s.nodes.in-use' % provider["name"]},
{'m': 'nodepool.provider.%s.nodes.used' % provider["name"]},
{'m': 'nodepool.provider.%s.nodes.deleting' % provider["name"]}, ],
select_fct='distinct',
stack=True,
seriesOverrides='Max',
yaxes_format={'label': 'node'},
tooltip='individual'))
d["rows"].append(launch_row)
return d
def update_grafyaml_action(args):
    '''Update the grafyaml files; return 4 on change'''
if args.zuul_url is None or args.config_dir is None or \
args.output_dir is None:
sfconfig.utils.fail("Zuul url and config, output dir are required")
args.tenants = get_tenants(args.zuul_url)
args.providers = get_providers(args.config_dir)
data = {
"_zuul.yaml": zuul_dashboard(args),
"_nodepool.yaml": nodepool_dashboard(args)
}
for provider in args.providers:
data["_nodepool-%s.yaml" % provider["name"]] = provider_dashboard(
args, provider)
changed = False
    for name, dashboard_data in data.items():
        content = yaml.safe_dump({'dashboard': dashboard_data},
                                 default_flow_style=False)
        graf_file = os.path.join(args.output_dir, name)
        if os.path.exists(graf_file):
            with open(graf_file) as existing:
                if existing.read() == content:
                    continue
        changed = True
        with open(graf_file, "w") as out:
            out.write(content)
print("%s: updated content" % graf_file)
return 4 if changed else 0
############################
# Deployment configuration #
############################
def get(url):
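    '''GET the url, retrying every 10 seconds, up to 12 attempts'''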
for _ in range(12):
try:
return urllib.request.urlopen(url)
except URLError as err:
print("Unable to fetch %s due to: %s (will retry in 10s)" % (
url, err))
time.sleep(10)
    raise RuntimeError("Unable to fetch %s (after 12 attempts)" % url)
def get_tenants(zuul_url):
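    '''Fetch the tenants and their pipeline names from the Zuul API'''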
result = []
tenants = json.loads(get("%s/api/tenants" % zuul_url).read())
for tenant in tenants:
tenant_data = {"name": tenant["name"], "pipelines": []}
pipelines = json.loads(get(
"%s/api/tenant/%s/pipelines" % (zuul_url, tenant["name"])).read())
for pipeline in pipelines:
tenant_data["pipelines"].append(pipeline["name"])
result.append(tenant_data)
return result
def get_providers(config_dir):
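    '''Read provider names and drivers from the nodepool configuration'''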
result = []
nodepool_conf = sfconfig.utils.yaml_merge_load(os.path.join(config_dir,
"nodepool"))
for provider in nodepool_conf.get("providers", []):
result.append(
{"name": provider["name"],
"driver": provider.get("driver", "openstack")})
return result
#########
# Usage #
#########
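# Example invocations (the script name, hostnames and paths below are
# placeholders):
#   python graph_render.py --influxdb-host influxdb.example.com \
#       --influxdb-password secret update-cq
#   python graph_render.py --zuul-url https://zuul.example.com \
#       --config-dir /etc/software-factory/config \
#       --output-dir /var/lib/grafana/dashboards update-grafyaml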
def main():
p = argparse.ArgumentParser(
description="Generate dynamic grafyaml dashboard")
p.add_argument("--influxdb-host")
p.add_argument("--influxdb-password")
p.add_argument("--zuul-url")
p.add_argument("--config-dir")
p.add_argument("--output-dir")
p.add_argument("action", choices=("update-cq", "update-grafyaml"))
args = p.parse_args()
if args.action == "update-cq":
ret = update_cq_action(args)
elif args.action == "update-grafyaml":
ret = update_grafyaml_action(args)
    sys.exit(ret)
if __name__ == "__main__":
main()