###############################################################################
#
# CoCalc: Collaborative Calculation in the Cloud
#
# Copyright (C) 2016 -- 2017, SageMath, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
###############################################################################

# This is a small helper class to record real-time metrics about the hub.
# It is designed for the hub, such that a local process can easily check its health.
# After an initial version, this has been repurposed to use prometheus.
# It wraps its client elements and adds some instrumentation to some hub components.

fs         = require('fs')
path       = require('path')
underscore = require('underscore')
{execSync} = require('child_process')
{defaults} = misc = require('smc-util/misc')

# Prometheus client setup -- https://github.com/siimon/prom-client
prom_client = require('prom-client')
prom_default_metrics = prom_client.defaultMetrics
# additionally, record GC statistics
# https://www.npmjs.com/package/prometheus-gc-stats
require('prometheus-gc-stats')()()

# some constants
FREQ_s     = 10   # update stats every FREQ seconds
DELAY_s    = 5    # with an initial delay of DELAY seconds
#DISC_LEN   = 10   # length of queue for recording discrete values
#MAX_BUFFER = 1000 # max. size of buffered values, which are cleared in the @_update step

# CLK_TCK (usually 100, but maybe not ...)
# used to convert the jiffies reported in /proc/<pid>/stat into seconds;
# kept null when it cannot be determined so _collect can skip CPU accounting.
try
    CLK_TCK = parseInt(execSync('getconf CLK_TCK', {encoding: 'utf8'}), 10)
    CLK_TCK = null if isNaN(CLK_TCK)
catch err
    CLK_TCK = null

###
# exponential smoothing, based on linux's load 1-exp(-1) smoothing
# with compensation for sampling time FREQ_s
d = 1 - Math.pow(Math.exp(-1), FREQ_s / 60)
DECAY = [d, Math.pow(d, 5), Math.pow(d, 15)]
###

###
# there is more than just continuous values
# cont: continuous (like number of changefeeds), will be smoothed
# disc: discrete, like blocked, will be recorded with timestamp
#       in a queue of length DISC_LEN
exports.TYPE = TYPE =
    COUNT: 'counter'    # strictly non-decreasing integer
    GAUGE: 'gauge'      # only the most recent value is recorded
    LAST : 'latest'     # only the most recent value is recorded
    DISC : 'discrete'   # timeseries of length DISC_LEN
    CONT : 'continuous' # continuous with exponential decay
    MAX  : 'contmax'    # like CONT, reduces buffer to max value
    SUM  : 'contsum'    # like CONT, reduces buffer to sum of values divided by FREQ_s
###

exports.new_counter = new_counter = (name, help, labels) ->
    # a prometheus counter -- https://github.com/siimon/prom-client#counter
    # use it like counter.labels(labelA, labelB).inc([positive number or default is 1])
    if not name.endsWith('_total')
        # throw a proper Error (not a bare string) so callers get a stack trace
        throw new Error("Counter metric names have to end in [_unit]_total but I got '#{name}' -- https://prometheus.io/docs/practices/naming/")
    return new prom_client.Counter(name, help, labels)

exports.new_gauge = new_gauge = (name, help, labels) ->
    # a prometheus gauge -- https://github.com/siimon/prom-client#gauge
    # basically, use it like gauge.labels(labelA, labelB).set(value)
    return new prom_client.Gauge(name, help, labels)

exports.new_quantile = new_quantile = (name, help, config={}) ->
    # a prometheus summary of observed quantiles -- invoked as quantile.observe(value)
    config = defaults config,
        # a few more than the default, in particular including the actual min and max
        percentiles: [0.0, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 1.0]
        labels : []
    return new prom_client.Summary(name, help, config.labels, percentiles: config.percentiles)
exports.new_summary = new_summary = new_quantile

exports.new_histogram = new_histogram = (name, help, config={}) ->
    # a prometheus histogram with fixed buckets -- invoked as histogram.observe(value)
    config = defaults config,
        buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10]
        labels: []
    return new prom_client.Histogram(name, help, config.labels, buckets:config.buckets)

class MetricsRecorder
    constructor: (@dbg, cb) ->
        ###
        * @dbg: reporting via winston, instance with configuration passed in from hub.coffee
        * cb  : optional callback, invoked as cb(undefined, @) once setup is done
        ###
        # stores the current state of the statistics
        @_stats = {}
        @_types = {} # key → TYPE.T mapping

        # the full statistic
        @_data = {}
        @_collectors = []

        # initialization finished
        @setup_monitoring()
        cb?(undefined, @)

    metrics: =>
        ###
        get a serialized representation of the metrics status
        (was a dict that should be JSON, now it is for prometheus)
        it's only called by hub_http_server for the /metrics endpoint
        ###
        return prom_client.register.metrics()

    register_collector: (collector) =>
        # The added collector functions will be evaluated periodically to gather metrics
        @_collectors.push(collector)

    setup_monitoring: =>
        # setup monitoring of some components
        # called by the hub *after* setting up the DB, etc.
        num_clients_gauge = new_gauge('clients_count', 'Number of connected clients')
        {number_of_clients} = require('./hub_register')
        @register_collector ->
            try
                num_clients_gauge.set(number_of_clients())
            catch
                # hub_register not ready yet -- report zero instead of failing
                num_clients_gauge.set(0)

        # our own CPU metrics monitor, separating user and sys!
        # it's actually a counter, since it is non-decreasing, but we'll use .set(...)
        @_cpu_seconds_total = new_gauge('process_cpu_categorized_seconds_total', 'Total number of CPU seconds used', ['type'])

        # init periodically calling @_collect
        setTimeout((=> setInterval(@_collect, FREQ_s * 1000)), DELAY_s * 1000)

    _collect: =>
        # runs periodically (driven by the setInterval in @setup_monitoring)
        # to evaluate the registered collector functions
        #@dbg('_collect called')
        for c in @_collectors
            c()
        # linux specific: collecting this process and all its children sys+user times
        # http://man7.org/linux/man-pages/man5/proc.5.html
        fs.readFile path.join('/proc', ''+process.pid, 'stat'), 'utf8', (err, infos) =>
            if err or not CLK_TCK?
                @dbg("_collect err: #{err}")
                return
            # there might be spaces in the process name, hence split after the closing bracket!
            infos = infos[infos.lastIndexOf(')') + 2...].split(' ')
            # fields 14-17 of proc/<pid>/stat (utime, stime, cutime, cstime),
            # offset by the two fields we cut away above; values are in jiffies
            @_cpu_seconds_total.labels('user')       .set(parseFloat(infos[11]) / CLK_TCK)
            @_cpu_seconds_total.labels('system')     .set(parseFloat(infos[12]) / CLK_TCK)
            # time spent waiting on child processes
            @_cpu_seconds_total.labels('chld_user')  .set(parseFloat(infos[13]) / CLK_TCK)
            @_cpu_seconds_total.labels('chld_system').set(parseFloat(infos[14]) / CLK_TCK)



    # some of the commented code below might be used in the future when periodically collecting data (e.g. sliding max of "concurrent" value)
    ###
    # every FREQ_s the _data dict is being updated
    # e.g current value, exp decay, later on also "intelligent" min/max, etc.
    _update: ->
        @_collect()

        smooth = (new_value, arr) ->
            arr ?= []
            arr[0] = new_value
            # compute smoothed value `sval` for each decay param
            for d, idx in DECAY
                sval = arr[idx + 1] ? new_value
                sval = d * new_value + (1-d) * sval
                arr[idx + 1] = sval
            return arr

        for key, values of @_stats
            # if no new value is available, we have to create one for smoothing
            if not values?.length > 0
                # fallback to last, unless discrete
                if @_types[key] != TYPE.DISC
                    [..., value] = @_data[key]
                    # in case _data[key] is empty, abort
                    if not value?
                        continue
                    # sum is special case, because of sum/FREQ_s below
                    if @_types[key] == TYPE.SUM
                        value *= FREQ_s
                    # one-element array
                    values = [value]
                else
                    values = []

            # computing the updated value for the @_data entries
            switch @_types[key]
                when TYPE.MAX
                    @_data[key] = smooth(values[0], @_data[key])

                when TYPE.CONT
                    # compute the average value (TODO median?)
                    sum = underscore.reduce(values, ((a, b) -> a+b), 0)
                    avg = sum / values.length
                    @_data[key] = smooth(avg, @_data[key])

                when TYPE.SUM
                    # compute the cumulative sum per second (e.g. database modifications)
                    sum = underscore.reduce(values, ((a, b) -> a+b), 0)
                    sum /= FREQ_s # to get a per 1s value!
                    @_data[key] = smooth(sum, @_data[key])

                when TYPE.DISC
                    # this is a pair [timestamp, discrete value], appended to the data queue
                    queue = @_data[key] ? []
                    @_data[key] = [queue..., values...][-DISC_LEN..]

                when TYPE.LAST
                    if values?.length > 0
                        # ... just store the most recent one
                        @_data[key] = values[0]

            # we've consumed the value(s), reset them
            @_stats[key] = []

    record: (key, value, type = TYPE.CONT) =>
        # store in @_stats a key → bounded array
        if (@_types[key] ? type) != type
            @dbg("WARNING: you are switching types from #{@_types[key]} to #{type} -- IGNORED")
            return
        @_types[key] = type
        switch type
            when TYPE.LAST
                @_stats[key] = [value]
            when TYPE.CONT, TYPE.SUM
                arr = @_stats[key] ? []
                @_stats[key] = [arr..., value]
            when TYPE.MAX
                current = @_stats[key] ? Number.NEGATIVE_INFINITY
                @_stats[key] = [Math.max(value, current)]
            when TYPE.DISC
                ts = (new Date()).toISOString()
                arr = @_stats[key] ? []
                @_stats[key] = [arr..., [ts, value]]
            else
                # NOTE(review): original used single quotes here, so the type
                # would never be interpolated -- use double quotes if revived
                @dbg?("hub/record_stats: unknown or undefined type #{type}")
        # avoid overflows
        @_stats[key] = @_stats[key][-MAX_BUFFER..]
    ###

# module-level singleton, created by init() and handed out by get()
metricsRecorder = null
exports.init = (winston, cb) ->
    dbg = (msg) ->
        winston.info("MetricsRecorder: #{msg}")
    metricsRecorder = new MetricsRecorder(dbg, cb)

exports.get = ->
    return metricsRecorder