diff --git a/README.md b/README.md
index 2ad5b73f..abbcd6b5 100644
--- a/README.md
+++ b/README.md
@@ -140,6 +140,9 @@ The following environment variables configure the exporter:
 * `PG_EXPORTER_EXCLUDE_DATABASES`
   A comma-separated list of databases to remove when autoDiscoverDatabases is enabled. Default is empty string.
 
+* `PG_EXPORTER_METRIC_PREFIX`
+  A prefix to use for each of the default metrics exported by postgres-exporter. Default is `pg`.
+
 Settings set by environment variables starting with `PG_` will be overwritten by the corresponding CLI flag if given.
 
 ### Setting the Postgres server's data source name
diff --git a/cmd/postgres_exporter/postgres_exporter.go b/cmd/postgres_exporter/postgres_exporter.go
index 0f941616..8a3e1617 100644
--- a/cmd/postgres_exporter/postgres_exporter.go
+++ b/cmd/postgres_exporter/postgres_exporter.go
@@ -53,6 +53,7 @@ var (
 	onlyDumpMaps       = kingpin.Flag("dumpmaps", "Do not run, simply dump the maps.").Bool()
 	constantLabelsList = kingpin.Flag("constantLabels", "A list of label=value separated by comma(,).").Default("").Envar("PG_EXPORTER_CONSTANT_LABELS").String()
 	excludeDatabases   = kingpin.Flag("exclude-databases", "A list of databases to remove when autoDiscoverDatabases is enabled").Default("").Envar("PG_EXPORTER_EXCLUDE_DATABASES").String()
+	metricPrefix       = kingpin.Flag("metric-prefix", "A metric prefix can be used to have non-default (not \"pg\") prefixes for each of the metrics").Default("pg").Envar("PG_EXPORTER_METRIC_PREFIX").String()
 )
 
 // Metric name parts.
@@ -80,6 +81,7 @@ const (
 	GAUGE        ColumnUsage = iota // Use this column as a gauge
 	MAPPEDMETRIC ColumnUsage = iota // Use this column with the supplied mapping of text values
 	DURATION     ColumnUsage = iota // This column should be interpreted as a text duration (and converted to milliseconds)
+	HISTOGRAM    ColumnUsage = iota // Use this column as a histogram
 )
 
 // UnmarshalYAML implements the yaml.Unmarshaller interface.
@@ -169,6 +171,7 @@ type MetricMapNamespace struct {
 // be mapped to by the collector
 type MetricMap struct {
 	discard    bool // Should metric be discarded during mapping?
+	histogram  bool // Should metric be treated as a histogram?
 	vtype      prometheus.ValueType              // Prometheus valuetype
 	desc       *prometheus.Desc                  // Prometheus descriptor
 	conversion func(interface{}) (float64, bool) // Conversion function to turn PG result into float64
@@ -376,7 +379,8 @@ var queryOverrides = map[string][]OverrideQuery{
 			        ('sharelock'),
 			        ('sharerowexclusivelock'),
 			        ('exclusivelock'),
-			        ('accessexclusivelock')
+			        ('accessexclusivelock'),
+			        ('sireadlock')
 				) AS tmp(mode) CROSS JOIN pg_database
 			LEFT JOIN
 			  (SELECT database, lower(mode) AS mode,count(*) AS count
@@ -598,6 +602,8 @@ func makeDescMap(pgVersion semver.Version, serverLabels prometheus.Labels, metri
 	for namespace, intermediateMappings := range metricMaps {
 		thisMap := make(map[string]MetricMap)
 
+		namespace = strings.Replace(namespace, "pg", *metricPrefix, 1)
+
 		// Get the constant labels
 		var variableLabels []string
 		for columnName, columnMapping := range intermediateMappings.columnMappings {
@@ -650,6 +656,27 @@ func makeDescMap(pgVersion semver.Version, serverLabels prometheus.Labels, metri
 						return dbToFloat64(in)
 					},
 				}
+			case HISTOGRAM:
+				thisMap[columnName] = MetricMap{
+					histogram: true,
+					vtype:     prometheus.UntypedValue,
+					desc:      prometheus.NewDesc(fmt.Sprintf("%s_%s", namespace, columnName), columnMapping.description, variableLabels, serverLabels),
+					conversion: func(in interface{}) (float64, bool) {
+						return dbToFloat64(in)
+					},
+				}
+				thisMap[columnName+"_bucket"] = MetricMap{
+					histogram: true,
+					discard:   true,
+				}
+				thisMap[columnName+"_sum"] = MetricMap{
+					histogram: true,
+					discard:   true,
+				}
+				thisMap[columnName+"_count"] = MetricMap{
+					histogram: true,
+					discard:   true,
+				}
 			case MAPPEDMETRIC:
 				thisMap[columnName] = MetricMap{
 					vtype: prometheus.GaugeValue,
@@ -721,6 +748,9 @@ func stringToColumnUsage(s string) (ColumnUsage, error) {
 	case "GAUGE":
 		u = GAUGE
 
+	case "HISTOGRAM":
+		u = HISTOGRAM
+
 	case "MAPPEDMETRIC":
 		u = MAPPEDMETRIC
 
@@ -772,6 +802,46 @@ func dbToFloat64(t interface{}) (float64, bool) {
 	}
 }
 
+// Convert database.sql types to uint64 for Prometheus consumption. Null types are mapped to 0. string and []byte
+// types are parsed with strconv.ParseUint; values that fail to parse are mapped to 0 and !ok.
+func dbToUint64(t interface{}) (uint64, bool) {
+	switch v := t.(type) {
+	case uint64:
+		return v, true
+	case int64:
+		return uint64(v), true
+	case float64:
+		return uint64(v), true
+	case time.Time:
+		return uint64(v.Unix()), true
+	case []byte:
+		// Try and convert to string and then parse to a uint64
+		strV := string(v)
+		result, err := strconv.ParseUint(strV, 10, 64)
+		if err != nil {
+			log.Infoln("Could not parse []byte:", err)
+			return 0, false
+		}
+		return result, true
+	case string:
+		result, err := strconv.ParseUint(v, 10, 64)
+		if err != nil {
+			log.Infoln("Could not parse string:", err)
+			return 0, false
+		}
+		return result, true
+	case bool:
+		if v {
+			return 1, true
+		}
+		return 0, true
+	case nil:
+		return 0, true
+	default:
+		return 0, false
+	}
+}
+
 // Convert database.sql to string for Prometheus labels. Null types are mapped to empty strings.
 func dbToString(t interface{}) (string, bool) {
 	switch v := t.(type) {
@@ -977,7 +1047,7 @@ func (s *Servers) GetServer(dsn string) (*Server, error) {
 	var err error
 	var ok bool
 	errCount := 0 // start at zero because we increment before doing work
-	retries := 3
+	retries := 1
 	var server *Server
 	for {
 		if errCount++; errCount > retries {
@@ -1167,29 +1237,6 @@ func (e *Exporter) setupInternalMetrics() {
 
 // Describe implements prometheus.Collector.
 func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
-	// We cannot know in advance what metrics the exporter will generate
-	// from Postgres. So we use the poor man's describe method: Run a collect
-	// and send the descriptors of all the collected metrics. The problem
-	// here is that we need to connect to the Postgres DB. If it is currently
-	// unavailable, the descriptors will be incomplete. Since this is a
-	// stand-alone exporter and not used as a library within other code
-	// implementing additional metrics, the worst that can happen is that we
-	// don't detect inconsistent metrics created by this exporter
-	// itself. Also, a change in the monitored Postgres instance may change the
-	// exported metrics during the runtime of the exporter.
-	metricCh := make(chan prometheus.Metric)
-	doneCh := make(chan struct{})
-
-	go func() {
-		for m := range metricCh {
-			ch <- m.Desc()
-		}
-		close(doneCh)
-	}()
-
-	e.Collect(metricCh)
-	close(metricCh)
-	<-doneCh
 }
 
 // Collect implements prometheus.Collector.
@@ -1304,13 +1351,68 @@ func queryNamespaceMapping(server *Server, namespace string, mapping MetricMapNa
 					continue
 				}
 
-				value, ok := dbToFloat64(columnData[idx])
-				if !ok {
-					nonfatalErrors = append(nonfatalErrors, errors.New(fmt.Sprintln("Unexpected error parsing column: ", namespace, columnName, columnData[idx])))
-					continue
+				if metricMapping.histogram {
+					var keys []float64
+					err = pq.Array(&keys).Scan(columnData[idx])
+					if err != nil {
+						return []prometheus.Metric{}, []error{}, errors.New(fmt.Sprintln("Error retrieving", columnName, "buckets:", namespace, err))
+					}
+
+					var values []int64
+					valuesIdx, ok := columnIdx[columnName+"_bucket"]
+					if !ok {
+						nonfatalErrors = append(nonfatalErrors, errors.New(fmt.Sprintln("Missing column: ", namespace, columnName+"_bucket")))
+						continue
+					}
+					err = pq.Array(&values).Scan(columnData[valuesIdx])
+					if err != nil {
+						return []prometheus.Metric{}, []error{}, errors.New(fmt.Sprintln("Error retrieving", columnName, "bucket values:", namespace, err))
+					}
+
+					buckets := make(map[float64]uint64, len(keys))
+					for i, key := range keys {
+						if i >= len(values) {
+							break
+						}
+						buckets[key] = uint64(values[i])
+					}
+
+					idx, ok = columnIdx[columnName+"_sum"]
+					if !ok {
+						nonfatalErrors = append(nonfatalErrors, errors.New(fmt.Sprintln("Missing column: ", namespace, columnName+"_sum")))
+						continue
+					}
+					sum, ok := dbToFloat64(columnData[idx])
+					if !ok {
+						nonfatalErrors = append(nonfatalErrors, errors.New(fmt.Sprintln("Unexpected error parsing column: ", namespace, columnName+"_sum", columnData[idx])))
+						continue
+					}
+
+					idx, ok = columnIdx[columnName+"_count"]
+					if !ok {
+						nonfatalErrors = append(nonfatalErrors, errors.New(fmt.Sprintln("Missing column: ", namespace, columnName+"_count")))
+						continue
+					}
+					count, ok := dbToUint64(columnData[idx])
+					if !ok {
+						nonfatalErrors = append(nonfatalErrors, errors.New(fmt.Sprintln("Unexpected error parsing column: ", namespace, columnName+"_count", columnData[idx])))
+						continue
+					}
+
+					metric = prometheus.MustNewConstHistogram(
+						metricMapping.desc,
+						count, sum, buckets,
+						labels...,
+					)
+				} else {
+					value, ok := dbToFloat64(columnData[idx])
+					if !ok {
+						nonfatalErrors = append(nonfatalErrors, errors.New(fmt.Sprintln("Unexpected error parsing column: ", namespace, columnName, columnData[idx])))
+						continue
+					}
+					// Generate the metric
+					metric = prometheus.MustNewConstMetric(metricMapping.desc, metricMapping.vtype, value, labels...)
 				}
-				// Generate the metric
-				metric = prometheus.MustNewConstMetric(metricMapping.desc, metricMapping.vtype, value, labels...)
 			} else {
 				// Unknown metric. Report as untyped if scan to float64 works, else note an error too.
 				metricLabel := fmt.Sprintf("%s_%s", namespace, columnName)
@@ -1515,20 +1617,36 @@ func (e *Exporter) scrape(ch chan<- prometheus.Metric) {
 }
 
 func (e *Exporter) discoverDatabaseDSNs() []string {
+	// connstring syntax is complex (and not sure if even regular).
+	// we don't need to parse it, so just superficially validate that it starts
+	// with a valid-ish keyword pair
+	connstringRe := regexp.MustCompile(`^ *[a-zA-Z0-9]+ *= *[^= ]+`)
+
 	dsns := make(map[string]struct{})
 	for _, dsn := range e.dsn {
-		parsedDSN, err := url.Parse(dsn)
-		if err != nil {
-			log.Errorf("Unable to parse DSN (%s): %v", loggableDSN(dsn), err)
+		var dsnURI *url.URL
+		var dsnConnstring string
+
+		if strings.HasPrefix(dsn, "postgresql://") {
+			var err error
+			dsnURI, err = url.Parse(dsn)
+			if err != nil {
+				log.Errorf("Unable to parse DSN as URI (%s): %v", loggableDSN(dsn), err)
+				continue
+			}
+		} else if connstringRe.MatchString(dsn) {
+			dsnConnstring = dsn
+		} else {
+			log.Errorf("Unable to parse DSN as either URI or connstring (%s)", loggableDSN(dsn))
 			continue
 		}
 
-		dsns[dsn] = struct{}{}
 		server, err := e.servers.GetServer(dsn)
 		if err != nil {
 			log.Errorf("Error opening connection to database (%s): %v", loggableDSN(dsn), err)
 			continue
 		}
+		dsns[dsn] = struct{}{}
 
 		// If autoDiscoverDatabases is true, set first dsn as master database (Default: false)
 		server.master = true
@@ -1542,8 +1660,16 @@ func (e *Exporter) discoverDatabaseDSNs() []string {
 			if contains(e.excludeDatabases, databaseName) {
 				continue
 			}
-			parsedDSN.Path = databaseName
-			dsns[parsedDSN.String()] = struct{}{}
+
+			if dsnURI != nil {
+				dsnURI.Path = databaseName
+				dsn = dsnURI.String()
+			} else {
+				// replacing one dbname with another is complicated.
+				// just append new dbname to override.
+				dsn = fmt.Sprintf("%s dbname=%s", dsnConnstring, databaseName)
+			}
+			dsns[dsn] = struct{}{}
 		}
 	}
 
diff --git a/cmd/postgres_exporter/postgres_exporter_integration_test.go b/cmd/postgres_exporter/postgres_exporter_integration_test.go
index 0363af96..d575692a 100644
--- a/cmd/postgres_exporter/postgres_exporter_integration_test.go
+++ b/cmd/postgres_exporter/postgres_exporter_integration_test.go
@@ -126,3 +126,26 @@ func (s *IntegrationSuite) TestUnknownMetricParsingDoesntCrash(c *C) {
 	// scrape the exporter and make sure it works
 	exporter.scrape(ch)
 }
+
+// TestExtendQueriesDoesntCrash tests that specifying extend.query-path doesn't
+// crash.
+func (s *IntegrationSuite) TestExtendQueriesDoesntCrash(c *C) {
+	// Setup a dummy channel to consume metrics
+	ch := make(chan prometheus.Metric, 100)
+	go func() {
+		for range ch {
+		}
+	}()
+
+	dsn := os.Getenv("DATA_SOURCE_NAME")
+	c.Assert(dsn, Not(Equals), "")
+
+	exporter := NewExporter(
+		strings.Split(dsn, ","),
+		WithUserQueriesPath("../user_queries_test.yaml"),
+	)
+	c.Assert(exporter, NotNil)
+
+	// scrape the exporter and make sure it works
+	exporter.scrape(ch)
+}
diff --git a/cmd/postgres_exporter/postgres_exporter_test.go b/cmd/postgres_exporter/postgres_exporter_test.go
index 8222bf9d..0a471750 100644
--- a/cmd/postgres_exporter/postgres_exporter_test.go
+++ b/cmd/postgres_exporter/postgres_exporter_test.go
@@ -4,9 +4,11 @@ package main
 
 import (
 	"io/ioutil"
+	"math"
 	"os"
 	"reflect"
 	"testing"
+	"time"
 
 	"github.com/blang/semver"
 	"github.com/prometheus/client_golang/prometheus"
@@ -287,6 +289,22 @@ func UnsetEnvironment(c *C, d string) {
 	c.Assert(err, IsNil)
 }
 
+type isNaNChecker struct {
+	*CheckerInfo
+}
+
+var IsNaN Checker = &isNaNChecker{
+	&CheckerInfo{Name: "IsNaN", Params: []string{"value"}},
+}
+
+func (checker *isNaNChecker) Check(params []interface{}, names []string) (result bool, error string) {
+	param, ok := (params[0]).(float64)
+	if !ok {
+		return false, "obtained value type is not a float"
+	}
+	return math.IsNaN(param), ""
+}
+
 // test boolean metric type gets converted to float
 func (s *FunctionalSuite) TestBooleanConversionToValueAndString(c *C) {
 
@@ -294,6 +312,7 @@ func (s *FunctionalSuite) TestBooleanConversionToValueAndString(c *C) {
 		input          interface{}
 		expectedString string
 		expectedValue  float64
+		expectedCount  uint64
 		expectedOK     bool
 	}
@@ -302,19 +321,71 @@ func (s *FunctionalSuite) TestBooleanConversionToValueAndString(c *C) {
 			input:          true,
 			expectedString: "true",
 			expectedValue:  1.0,
+			expectedCount:  1,
 			expectedOK:     true,
 		},
 		{
 			input:          false,
 			expectedString: "false",
 			expectedValue:  0.0,
+			expectedCount:  0,
+			expectedOK:     true,
+		},
+		{
+			input:          nil,
+			expectedString: "",
+			expectedValue:  math.NaN(),
+			expectedCount:  0,
+			expectedOK:     true,
+		},
+		{
+			input:          TestCase{},
+			expectedString: "",
+			expectedValue:  math.NaN(),
+			expectedCount:  0,
+			expectedOK:     false,
+		},
+		{
+			input:          123.0,
+			expectedString: "123",
+			expectedValue:  123.0,
+			expectedCount:  123,
+			expectedOK:     true,
+		},
+		{
+			input:          "123",
+			expectedString: "123",
+			expectedValue:  123.0,
+			expectedCount:  123,
+			expectedOK:     true,
+		},
+		{
+			input:          []byte("123"),
+			expectedString: "123",
+			expectedValue:  123.0,
+			expectedCount:  123,
+			expectedOK:     true,
+		},
+		{
+			input:          time.Unix(1600000000, 0),
+			expectedString: "1600000000",
+			expectedValue:  1600000000.0,
+			expectedCount:  1600000000,
 			expectedOK:     true,
 		},
 	}
 
 	for _, cs := range cases {
 		value, ok := dbToFloat64(cs.input)
-		c.Assert(value, Equals, cs.expectedValue)
+		if math.IsNaN(cs.expectedValue) {
+			c.Assert(value, IsNaN)
+		} else {
+			c.Assert(value, Equals, cs.expectedValue)
+		}
+		c.Assert(ok, Equals, cs.expectedOK)
+
+		count, ok := dbToUint64(cs.input)
+		c.Assert(count, Equals, cs.expectedCount)
 		c.Assert(ok, Equals, cs.expectedOK)
 
 		str, ok := dbToString(cs.input)
diff --git a/cmd/postgres_exporter/tests/user_queries_test.yaml b/cmd/postgres_exporter/tests/user_queries_test.yaml
new file mode 100644
index 00000000..c9a39655
--- /dev/null
+++ b/cmd/postgres_exporter/tests/user_queries_test.yaml
@@ -0,0 +1,51 @@
+random:
+  query: |
+    WITH data AS (SELECT floor(random()*10) AS d FROM generate_series(1,100)),
+         metrics AS (SELECT SUM(d) AS sum, COUNT(*) AS count FROM data),
+         buckets AS (SELECT le, SUM(CASE WHEN d <= le THEN 1 ELSE 0 END) AS d
+                     FROM data, UNNEST(ARRAY[1, 2, 4, 8]) AS le GROUP BY le)
+    SELECT
+      sum AS histogram_sum,
+      count AS histogram_count,
+      ARRAY_AGG(le) AS histogram,
+      ARRAY_AGG(d) AS histogram_bucket,
+      ARRAY_AGG(le) AS missing,
+      ARRAY_AGG(le) AS missing_sum,
+      ARRAY_AGG(d) AS missing_sum_bucket,
+      ARRAY_AGG(le) AS missing_count,
+      ARRAY_AGG(d) AS missing_count_bucket,
+      sum AS missing_count_sum,
+      ARRAY_AGG(le) AS unexpected_sum,
+      ARRAY_AGG(d) AS unexpected_sum_bucket,
+      'data' AS unexpected_sum_sum,
+      ARRAY_AGG(le) AS unexpected_count,
+      ARRAY_AGG(d) AS unexpected_count_bucket,
+      sum AS unexpected_count_sum,
+      'nan'::varchar AS unexpected_count_count,
+      ARRAY_AGG(le) AS unexpected_bytes,
+      ARRAY_AGG(d) AS unexpected_bytes_bucket,
+      sum AS unexpected_bytes_sum,
+      'nan'::bytea AS unexpected_bytes_count
+    FROM metrics, buckets GROUP BY 1,2
+  metrics:
+    - histogram:
+        usage: "HISTOGRAM"
+        description: "Random data"
+    - missing:
+        usage: "HISTOGRAM"
+        description: "nonfatal error"
+    - missing_sum:
+        usage: "HISTOGRAM"
+        description: "nonfatal error"
+    - missing_count:
+        usage: "HISTOGRAM"
+        description: "nonfatal error"
+    - unexpected_sum:
+        usage: "HISTOGRAM"
+        description: "nonfatal error"
+    - unexpected_count:
+        usage: "HISTOGRAM"
+        description: "nonfatal error"
+    - unexpected_bytes:
+        usage: "HISTOGRAM"
+        description: "nonfatal error"
diff --git a/queries.yaml b/queries.yaml
index 18abd65b..800e62bd 100644
--- a/queries.yaml
+++ b/queries.yaml
@@ -1,5 +1,5 @@
 pg_replication:
-  query: "SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp())) as lag"
+  query: "SELECT CASE WHEN NOT pg_is_in_recovery() THEN 0 ELSE GREATEST (0, EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))) END AS lag"
   master: true
   metrics:
     - lag:
@@ -15,7 +15,32 @@ pg_postmaster:
       description: "Time at which postmaster started"
 
 pg_stat_user_tables:
-  query: "SELECT current_database() datname, schemaname, relname, seq_scan, seq_tup_read, idx_scan, idx_tup_fetch, n_tup_ins, n_tup_upd, n_tup_del, n_tup_hot_upd, n_live_tup, n_dead_tup, n_mod_since_analyze, COALESCE(last_vacuum, '1970-01-01Z'), COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum, COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum, COALESCE(last_analyze, '1970-01-01Z') as last_analyze, COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze, vacuum_count, autovacuum_count, analyze_count, autoanalyze_count FROM pg_stat_user_tables"
+  query: |
+    SELECT
+      current_database() datname,
+      schemaname,
+      relname,
+      seq_scan,
+      seq_tup_read,
+      idx_scan,
+      idx_tup_fetch,
+      n_tup_ins,
+      n_tup_upd,
+      n_tup_del,
+      n_tup_hot_upd,
+      n_live_tup,
+      n_dead_tup,
+      n_mod_since_analyze,
+      COALESCE(last_vacuum, '1970-01-01Z') as last_vacuum,
+      COALESCE(last_autovacuum, '1970-01-01Z') as last_autovacuum,
+      COALESCE(last_analyze, '1970-01-01Z') as last_analyze,
+      COALESCE(last_autoanalyze, '1970-01-01Z') as last_autoanalyze,
+      vacuum_count,
+      autovacuum_count,
+      analyze_count,
+      autoanalyze_count
+    FROM
+      pg_stat_user_tables
   metrics:
     - datname:
         usage: "LABEL"
@@ -203,3 +228,47 @@ pg_stat_statements:
     - blk_write_time_seconds:
         usage: "COUNTER"
         description: "Total time the statement spent writing blocks, in milliseconds (if track_io_timing is enabled, otherwise zero)"
+
+pg_stat_activity:
+  query: |
+    WITH
+      metrics AS (
+        SELECT
+          application_name,
+          SUM(EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change))::bigint)::float AS process_idle_seconds_sum,
+          COUNT(*) AS process_idle_seconds_count
+        FROM pg_stat_activity
+        WHERE state = 'idle'
+        GROUP BY application_name
+      ),
+      buckets AS (
+        SELECT
+          application_name,
+          le,
+          SUM(
+            CASE WHEN EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - state_change)) <= le
+              THEN 1
+              ELSE 0
+            END
+          )::bigint AS bucket
+        FROM
+          pg_stat_activity,
+          UNNEST(ARRAY[1, 2, 5, 15, 30, 60, 90, 120, 300]) AS le
+        GROUP BY application_name, le
+        ORDER BY application_name, le
+      )
+    SELECT
+      application_name,
+      process_idle_seconds_sum,
+      process_idle_seconds_count,
+      ARRAY_AGG(le) AS process_idle_seconds,
+      ARRAY_AGG(bucket) AS process_idle_seconds_bucket
+    FROM metrics JOIN buckets USING (application_name)
+    GROUP BY 1, 2, 3
+  metrics:
+    - application_name:
+        usage: "LABEL"
+        description: "Application Name"
+    - process_idle_seconds:
+        usage: "HISTOGRAM"
+        description: "Idle time of server processes"
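
Note (not part of the patch): the HISTOGRAM column usage introduced above expects a naming convention in the query results. The column declared as `usage: "HISTOGRAM"` must return an array of bucket upper bounds (`le` values), and the query must also return `<column>_bucket` (cumulative counts per bound), `<column>_sum`, and `<column>_count`, exactly as the `pg_stat_activity` query does. The following is a minimal sketch of a user-supplied queries file using that convention; the `my_stats` namespace, column names, and literal values are made up purely for illustration.

```yaml
# Hypothetical example of the HISTOGRAM convention added by this patch.
# The base column holds the bucket bounds; its _bucket, _sum and _count
# siblings hold cumulative counts, the observation sum and the total count.
my_stats:
  query: |
    SELECT
      ARRAY[0.1, 1, 10]::float8[] AS duration_seconds,         -- bucket upper bounds (le)
      ARRAY[5, 9, 10]::bigint[]   AS duration_seconds_bucket,  -- cumulative counts per bound
      12.3                        AS duration_seconds_sum,     -- sum of all observations
      10                          AS duration_seconds_count    -- total number of observations
  metrics:
    - duration_seconds:
        usage: "HISTOGRAM"
        description: "Example histogram of durations"
```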