
Grafana + Prometheus: From Thinking I Knew to Actually Knowing


The Day I Didn't Know What Was Wrong with My System

One day a strange problem showed up: response times were intermittently slow

  • Sometimes the API responded in 200ms (normal)
  • Sometimes it took 5-8 seconds (terrible)
  • No clear pattern - it looked completely random

How I used to debug, back before I knew anything about monitoring:

# Tail logs manually
tail -f application.log | grep "ERROR"

# Check server resources
top
htop

# Look for slow database queries
mysql> SHOW PROCESSLIST;

# Check disk space
df -h

The problem: all of this data is point-in-time - you can't see trends or correlate anything! 😩

It took hitting a real problem to understand: "if you can't measure it, you don't really know it"

So I started learning Prometheus + Grafana seriously! 📊

Getting Started with Prometheus

1. Basic Prometheus Setup

# prometheus.yml - Basic configuration
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "rules/*.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - alertmanager:9093

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'node-exporter'
    static_configs:
      - targets: ['localhost:9100']

  - job_name: 'my-app'
    static_configs:
      # Note: when Prometheus runs in Docker (as below), 'localhost' is the Prometheus
      # container itself; use 'host.docker.internal:3000' or a compose service name instead
      - targets: ['localhost:3000']
    metrics_path: '/metrics'
    scrape_interval: 5s

# docker-compose.yml - A simple setup
version: '3.8'
services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3001:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin123
    volumes:
      - grafana_data:/var/lib/grafana

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    ports:
      - "9100:9100"

volumes:
  prometheus_data:
  grafana_data:

2. Application Metrics Integration

// metrics.js - Express app integration
const express = require('express');
const promClient = require('prom-client');

// Use the default (global) metrics registry
const register = promClient.register;

// Collect default metrics
promClient.collectDefaultMetrics({ register });

// Custom metrics
const httpRequestDuration = new promClient.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status'],
  buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5]
});

const httpRequestTotal = new promClient.Counter({
  name: 'http_requests_total', 
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status']
});

const activeConnections = new promClient.Gauge({
  name: 'nodejs_active_connections',
  help: 'Number of active connections'
});

const databaseQueryDuration = new promClient.Histogram({
  name: 'database_query_duration_seconds',
  help: 'Database query execution time',
  labelNames: ['query_type', 'table'],
  buckets: [0.001, 0.01, 0.1, 0.5, 1, 2, 5]
});

// Register custom metrics
register.registerMetric(httpRequestDuration);
register.registerMetric(httpRequestTotal);  
register.registerMetric(activeConnections);
register.registerMetric(databaseQueryDuration);

module.exports = {
  register,
  httpRequestDuration,
  httpRequestTotal,
  activeConnections, 
  databaseQueryDuration
};

// middleware.js - Express monitoring middleware
const { 
  httpRequestDuration, 
  httpRequestTotal,
  activeConnections
} = require('./metrics');

function metricsMiddleware(req, res, next) {
  const startTime = Date.now();
  
  // Track active connections
  activeConnections.inc();
  
  res.on('finish', () => {
    const duration = (Date.now() - startTime) / 1000;
    const route = req.route ? req.route.path : req.path;
    
    // Record metrics
    httpRequestDuration
      .labels(req.method, route, res.statusCode)
      .observe(duration);
      
    httpRequestTotal
      .labels(req.method, route, res.statusCode)
      .inc();
      
    activeConnections.dec();
  });
  
  next();
}

module.exports = metricsMiddleware;

// app.js - Main application
const express = require('express');
const { register } = require('./metrics');
const metricsMiddleware = require('./middleware');

const app = express();

// Apply the metrics middleware to every request
app.use(metricsMiddleware);

// Metrics endpoint for Prometheus to scrape
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', register.contentType);
  // register.metrics() returns a Promise in recent prom-client versions
  res.end(await register.metrics());
});

// Business routes
app.get('/api/products', async (req, res) => {
  try {
    const products = await fetchProducts(); // Your business logic
    res.json(products);
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});

app.listen(3000, () => {
  console.log('Server running on port 3000');
  console.log('Metrics available at http://localhost:3000/metrics');
});

Database Monitoring Integration

// db-metrics.js - Database monitoring
const mysql = require('mysql2/promise');
const promClient = require('prom-client'); // needed below for the pool gauge
const { databaseQueryDuration } = require('./metrics');

class DatabaseWithMetrics {
  constructor(config) {
    this.pool = mysql.createPool(config);
  }
  
  async query(sql, params = [], metadata = {}) {
    const startTime = Date.now();
    const queryType = this.getQueryType(sql);
    const table = metadata.table || this.extractTable(sql);
    
    try {
      const [results] = await this.pool.execute(sql, params);
      
      // Record successful query metrics
      const duration = (Date.now() - startTime) / 1000;
      databaseQueryDuration
        .labels(queryType, table)
        .observe(duration);
      
      return results;
    } catch (error) {
      // Record failed query metrics  
      const duration = (Date.now() - startTime) / 1000;
      databaseQueryDuration
        .labels(`${queryType}_error`, table)
        .observe(duration);
      
      throw error;
    }
  }
  
  getQueryType(sql) {
    const normalized = sql.trim().toLowerCase();
    if (normalized.startsWith('select')) return 'select';
    if (normalized.startsWith('insert')) return 'insert';
    if (normalized.startsWith('update')) return 'update';
    if (normalized.startsWith('delete')) return 'delete';
    return 'other';
  }
  
  extractTable(sql) {
    // Simple table extraction (could be more sophisticated)
    const match = sql.match(/(?:from|into|update|join)\s+`?(\w+)`?/i);
    return match ? match[1] : 'unknown';
  }
  
  // Connection pool metrics (reads mysql2's internal fields; these may change between driver versions)
  getPoolMetrics() {
    return {
      totalConnections: this.pool.pool._allConnections.length,
      freeConnections: this.pool.pool._freeConnections.length,
      acquiringConnections: this.pool.pool._acquiringConnections.length
    };
  }
}

// Usage
const db = new DatabaseWithMetrics({
  host: 'localhost',
  user: 'root',
  password: 'password',
  database: 'myapp',
  waitForConnections: true,
  connectionLimit: 10,
  queueLimit: 0
});

// Add pool metrics
const dbPoolGauge = new promClient.Gauge({
  name: 'mysql_pool_connections',
  help: 'MySQL connection pool status',
  labelNames: ['state'],
  collect() {
    const metrics = db.getPoolMetrics();
    this.set({ state: 'total' }, metrics.totalConnections);
    this.set({ state: 'free' }, metrics.freeConnections); 
    this.set({ state: 'acquiring' }, metrics.acquiringConnections);
  }
});

module.exports = db;

Advanced Metrics Collection

1. Business Logic Metrics

// business-metrics.js - Track business events
const promClient = require('prom-client');

const orderMetrics = {
  // Order counters
  ordersTotal: new promClient.Counter({
    name: 'orders_total',
    help: 'Total number of orders',
    labelNames: ['status', 'payment_method']
  }),
  
  // Order value
  orderValue: new promClient.Histogram({
    name: 'order_value_dollars',
    help: 'Order value in dollars',
    labelNames: ['category'],
    buckets: [10, 50, 100, 500, 1000, 5000]
  }),
  
  // Processing time
  orderProcessingTime: new promClient.Histogram({
    name: 'order_processing_seconds',
    help: 'Time to process an order',
    labelNames: ['step'],
    buckets: [0.1, 0.5, 1, 2, 5, 10]
  }),
  
  // Current active orders
  activeOrders: new promClient.Gauge({
    name: 'active_orders_current',
    help: 'Currently active orders',
    labelNames: ['status']
  })
};

class OrderService {
  constructor() {
    this.orders = new Map();
  }
  
  async createOrder(orderData) {
    const startTime = Date.now();
    
    try {
      // Validate order (track validation time)
      const validationStart = Date.now();
      await this.validateOrder(orderData);
      orderMetrics.orderProcessingTime
        .labels('validation')
        .observe((Date.now() - validationStart) / 1000);
      
      // Process payment (track payment time)
      const paymentStart = Date.now();
      const payment = await this.processPayment(orderData);
      orderMetrics.orderProcessingTime
        .labels('payment')
        .observe((Date.now() - paymentStart) / 1000);
      
      // Create order
      const order = await this.saveOrder(orderData, payment);
      
      // Record metrics
      orderMetrics.ordersTotal
        .labels('created', orderData.paymentMethod)
        .inc();
        
      orderMetrics.orderValue
        .labels(orderData.category)
        .observe(orderData.total);
        
      orderMetrics.orderProcessingTime
        .labels('total')
        .observe((Date.now() - startTime) / 1000);
        
      orderMetrics.activeOrders
        .labels('pending')
        .inc();
      
      return order;
    } catch (error) {
      // Track failed orders
      orderMetrics.ordersTotal
        .labels('failed', orderData.paymentMethod)
        .inc();
        
      throw error;
    }
  }
  
  async fulfillOrder(orderId) {
    try {
      await this.shipOrder(orderId);
      
      orderMetrics.ordersTotal
        .labels('fulfilled', 'unknown')
        .inc();
        
      orderMetrics.activeOrders
        .labels('pending')
        .dec();
        
      orderMetrics.activeOrders
        .labels('fulfilled')
        .inc();
        
    } catch (error) {
      orderMetrics.ordersTotal
        .labels('fulfillment_failed', 'unknown')
        .inc();
      
      throw error;
    }
  }
}

module.exports = { OrderService, orderMetrics };

2. External Service Monitoring

// external-service-metrics.js
const axios = require('axios');
const promClient = require('prom-client');

const externalServiceMetrics = {
  requestDuration: new promClient.Histogram({
    name: 'external_service_request_duration_seconds',
    help: 'Duration of external service requests',
    labelNames: ['service', 'endpoint', 'status'],
    buckets: [0.1, 0.5, 1, 2, 5, 10, 30]
  }),
  
  requestsTotal: new promClient.Counter({
    name: 'external_service_requests_total',
    help: 'Total external service requests',
    labelNames: ['service', 'endpoint', 'status']
  }),
  
  circuitBreakerState: new promClient.Gauge({
    name: 'circuit_breaker_state',
    help: 'Circuit breaker state (0=closed, 1=open, 2=half-open)',
    labelNames: ['service']
  })
};

class ExternalServiceClient {
  constructor(serviceName, baseURL) {
    this.serviceName = serviceName;
    this.client = axios.create({
      baseURL,
      timeout: 10000
    });
    
    this.setupInterceptors();
  }
  
  setupInterceptors() {
    // Request interceptor
    this.client.interceptors.request.use(
      (config) => {
        config.metadata = { startTime: Date.now() };
        return config;
      },
      (error) => Promise.reject(error)
    );
    
    // Response interceptor  
    this.client.interceptors.response.use(
      (response) => {
        this.recordMetrics(response.config, response.status);
        return response;
      },
      (error) => {
        const status = error.response ? error.response.status : 'network_error';
        this.recordMetrics(error.config, status);
        return Promise.reject(error);
      }
    );
  }
  
  recordMetrics(config, status) {
    if (!config.metadata) return;
    
    const duration = (Date.now() - config.metadata.startTime) / 1000;
    const endpoint = this.getEndpointLabel(config.url);
    
    externalServiceMetrics.requestDuration
      .labels(this.serviceName, endpoint, status)
      .observe(duration);
      
    externalServiceMetrics.requestsTotal
      .labels(this.serviceName, endpoint, status)
      .inc();
  }
  
  getEndpointLabel(url) {
    // Normalize URL to remove IDs and query params
    return url
      .replace(/\/\d+/g, '/:id')
      .replace(/\?.*/, '')
      .replace(/^\//, '');
  }
  
  async get(url, config = {}) {
    return await this.client.get(url, config);
  }
  
  async post(url, data, config = {}) {
    return await this.client.post(url, data, config);
  }
}

// Usage examples
const paymentService = new ExternalServiceClient('payment-gateway', 'https://api.payment.com');
const emailService = new ExternalServiceClient('email-service', 'https://api.email.com');

// Use in business logic
async function processPayment(paymentData) {
  try {
    const response = await paymentService.post('/charges', paymentData);
    return response.data;
  } catch (error) {
    console.error('Payment failed:', error);
    throw error;
  }
}
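
One gap above: the circuitBreakerState gauge gets declared but nothing ever sets it. A minimal sketch of how it could be wired up, assuming a simple failure-count breaker and assuming externalServiceMetrics is exported from the file above (the thresholds, file name, and cool-down behaviour here are illustrative, not from the original setup):

// circuit-breaker.js - hypothetical sketch: a failure-count breaker that reports its state
// Assumes external-service-metrics.js exports its metrics object (export not shown above)
const { externalServiceMetrics } = require('./external-service-metrics');

const STATE = { CLOSED: 0, OPEN: 1, HALF_OPEN: 2 };

class SimpleCircuitBreaker {
  constructor(serviceName, { failureThreshold = 5, resetTimeoutMs = 30000 } = {}) {
    this.serviceName = serviceName;
    this.failureThreshold = failureThreshold;
    this.resetTimeoutMs = resetTimeoutMs;
    this.failures = 0;
    this.setState(STATE.CLOSED);
  }

  setState(state) {
    this.state = state;
    // Push the state into the gauge so the dashboard and alerts can see it
    externalServiceMetrics.circuitBreakerState.labels(this.serviceName).set(state);
  }

  async call(fn) {
    if (this.state === STATE.OPEN) {
      throw new Error(`Circuit for ${this.serviceName} is open`);
    }
    try {
      const result = await fn();
      this.failures = 0;
      this.setState(STATE.CLOSED);
      return result;
    } catch (error) {
      this.failures += 1;
      if (this.failures >= this.failureThreshold) {
        this.setState(STATE.OPEN);
        // After a cool-down, allow probe requests through again
        setTimeout(() => this.setState(STATE.HALF_OPEN), this.resetTimeoutMs).unref();
      }
      throw error;
    }
  }
}

// Example: wrap the payment call from above
// const breaker = new SimpleCircuitBreaker('payment-gateway');
// const charge = await breaker.call(() => paymentService.post('/charges', paymentData));

module.exports = SimpleCircuitBreaker;

A library breaker would work just as well; the point is that whatever opens and closes the circuit should forward its state into the gauge.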

Grafana Dashboard Configuration

1. Application Performance Dashboard

// grafana-dashboard-app.json
{
  "dashboard": {
    "title": "Application Performance Dashboard",
    "panels": [
      {
        "title": "Request Rate (RPS)",
        "type": "graph", 
        "targets": [
          {
            "expr": "sum(rate(http_requests_total[5m])) by (method)",
            "legendFormat": "{{method}}"
          }
        ],
        "yAxes": [
          {
            "label": "Requests/sec"
          }
        ]
      },
      {
        "title": "Response Time Percentiles",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "50th percentile"
          },
          {
            "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "95th percentile"
          },
          {
            "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
            "legendFormat": "99th percentile"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "singlestat",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
            "legendFormat": "Error Rate %"
          }
        ],
        "thresholds": [
          {
            "value": 1,
            "color": "yellow"  
          },
          {
            "value": 5,
            "color": "red"
          }
        ]
      }
    ]
  }
}

2. Database Performance Dashboard

// grafana-dashboard-db.json  
{
  "dashboard": {
    "title": "Database Performance",
    "panels": [
      {
        "title": "Query Duration by Type",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum(rate(database_query_duration_seconds_bucket[5m])) by (query_type, le))",
            "legendFormat": "{{query_type}} (95th)"
          }
        ]
      },
      {
        "title": "Slow Queries (>1s)",
        "type": "graph", 
        "targets": [
          {
            "expr": "sum(rate(database_query_duration_seconds_bucket{le=\"1\"}[5m])) by (table)",
            "legendFormat": "{{table}}"
          }
        ]
      },
      {
        "title": "Connection Pool Status",
        "type": "graph",
        "targets": [
          {
            "expr": "mysql_pool_connections{state=\"free\"}",
            "legendFormat": "Free Connections"
          },
          {
            "expr": "mysql_pool_connections{state=\"total\"}",
            "legendFormat": "Total Connections"
          }
        ]
      }
    ]
  }
}

3. Business Metrics Dashboard

// grafana-dashboard-business.json
{
  "dashboard": {
    "title": "Business Metrics",
    "panels": [
      {
        "title": "Orders per Hour",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(increase(orders_total{status=\"created\"}[1h]))",
            "legendFormat": "New Orders"
          },
          {
            "expr": "sum(increase(orders_total{status=\"fulfilled\"}[1h]))",
            "legendFormat": "Fulfilled Orders"
          }
        ]
      },
      {
        "title": "Revenue by Category",
        "type": "pie",
        "targets": [
          {
            "expr": "sum(increase(order_value_dollars_sum[24h])) by (category)",
            "legendFormat": "{{category}}"
          }
        ]
      },
      {
        "title": "Average Order Value",
        "type": "singlestat",
        "targets": [
          {
            "expr": "sum(increase(order_value_dollars_sum[1h])) / sum(increase(order_value_dollars_count[1h]))",
            "legendFormat": "AOV"
          }
        ],
        "format": "currency"
      }
    ]
  }
}

Alerting Rules Configuration

1. Critical Alerts

# alerts/critical.yml
groups:
- name: critical-alerts
  rules:
  - alert: HighErrorRate
    expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "High error rate detected"
      description: "Error rate is {{ $value | humanizePercentage }} over the last 5 minutes"

  - alert: HighResponseTime
    expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > 2
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High response time"
      description: "95th percentile response time is {{ $value }}s"

  - alert: DatabaseSlowQueries
    expr: histogram_quantile(0.95, sum(rate(database_query_duration_seconds_bucket[5m])) by (le)) > 5
    for: 3m
    labels:
      severity: warning
    annotations:
      summary: "Slow database queries detected"
      description: "95th percentile query time is {{ $value }}s"

  - alert: ServiceDown
    expr: up{job="my-app"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Service is down"
      description: "{{ $labels.instance }} has been down for more than 1 minute"

2. Business Alerts

# alerts/business.yml
groups:
- name: business-alerts
  rules:
  - alert: LowOrderVolume
    expr: sum(increase(orders_total{status="created"}[1h])) < 10
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: "Low order volume"
      description: "Only {{ $value }} orders in the last hour"

  - alert: HighOrderFailureRate  
    expr: sum(rate(orders_total{status="failed"}[10m])) / sum(rate(orders_total[10m])) > 0.1
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "High order failure rate"
      description: "Order failure rate is {{ $value | humanizePercentage }}"

  - alert: ExternalServiceDown
    expr: sum(rate(external_service_requests_total{status="network_error"}[5m])) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "External service issues"
      description: "{{ $labels.service }} is experiencing connectivity issues"

Advanced PromQL Queries

1. Performance Analysis Queries

# Rate of requests per second by status code
sum(rate(http_requests_total[5m])) by (status)

# 95th percentile response time over time
histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))

# Error rate percentage
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100

# Top 5 slowest endpoints
topk(5, histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (route, le)))

# Request rate relative to active connections (a rough capacity proxy)
sum(rate(http_requests_total[5m])) / (sum(nodejs_active_connections) + 1) * 100

2. Resource Usage Queries

# Memory usage percentage
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

# CPU usage over time
100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

# Disk I/O rate
rate(node_disk_io_time_seconds_total[5m])

# Network throughput
rate(node_network_receive_bytes_total[5m]) + rate(node_network_transmit_bytes_total[5m])

# Database connection pool utilization
(mysql_pool_connections{state="total"} - mysql_pool_connections{state="free"}) / mysql_pool_connections{state="total"} * 100

3. Business Intelligence Queries

# Revenue per hour
sum(increase(order_value_dollars_sum[1h]))

# Conversion rate (orders vs page views)
sum(increase(orders_total{status="created"}[1h])) / sum(increase(http_requests_total{route="/"}[1h])) * 100

# Average order processing time
sum(rate(order_processing_seconds_sum[5m])) / sum(rate(order_processing_seconds_count[5m]))

# Most popular products by order volume (assumes orders_total also carries a product_id label)
topk(10, sum(increase(orders_total{status="created"}[24h])) by (product_id))

# Customer acquisition cost trend (assumes a marketing_spend_dollars counter is exported)
sum(increase(marketing_spend_dollars[1h])) / sum(increase(orders_total{status="created"}[1h]))

Real-world Monitoring Scenarios

1. Incident Response Runbook

// monitoring-runbook.js
class IncidentResponse {
  constructor(grafanaAPI, slackAPI) {
    this.grafana = grafanaAPI;
    this.slack = slackAPI;
    this.incidents = new Map();
  }
  
  async handleAlert(alert) {
    const incident = {
      id: this.generateIncidentId(),
      alert,
      startTime: new Date(),
      status: 'investigating',
      assignee: null,
      timeline: []
    };
    
    this.incidents.set(incident.id, incident);
    
    // Create Grafana annotation
    await this.grafana.createAnnotation({
      time: incident.startTime.getTime(),
      text: `🚨 INCIDENT: ${alert.alertname}`,
      tags: ['incident', alert.severity]
    });
    
    // Send Slack notification
    await this.slack.sendMessage({
      channel: '#incidents',
      text: `🚨 *${alert.severity.toUpperCase()}*: ${alert.summary}`,
      attachments: [
        {
          title: 'Incident Details',
          fields: [
            { title: 'Alert', value: alert.alertname },
            { title: 'Description', value: alert.description },
            { title: 'Runbook', value: this.getRunbookURL(alert) }
          ],
          color: alert.severity === 'critical' ? 'danger' : 'warning'
        }
      ]
    });
    
    // Auto-assign based on alert type
    incident.assignee = this.getOnCallEngineer(alert);
    
    return incident;
  }
  
  getRunbookURL(alert) {
    const runbooks = {
      'HighErrorRate': 'https://wiki.company.com/runbooks/high-error-rate',
      'HighResponseTime': 'https://wiki.company.com/runbooks/performance',
      'ServiceDown': 'https://wiki.company.com/runbooks/service-down'
    };
    
    return runbooks[alert.alertname] || 'https://wiki.company.com/runbooks/general';
  }
  
  // The helpers below aren't shown in the original write-up; minimal placeholder versions
  generateIncidentId() {
    return `INC-${Date.now()}`;
  }
  
  getOnCallEngineer(alert) {
    // Placeholder: look this up in your on-call schedule (PagerDuty, Opsgenie, a rota file, ...)
    return process.env.ON_CALL_ENGINEER || 'on-call';
  }
  
  formatDuration(ms) {
    const minutes = Math.round(ms / 60000);
    return minutes < 60 ? `${minutes}m` : `${Math.floor(minutes / 60)}h ${minutes % 60}m`;
  }
  
  async resolveIncident(incidentId, resolution) {
    const incident = this.incidents.get(incidentId);
    if (!incident) return;
    
    incident.status = 'resolved';
    incident.resolvedTime = new Date();
    incident.resolution = resolution;
    
    const duration = incident.resolvedTime - incident.startTime;
    
    // Update Grafana annotation
    await this.grafana.createAnnotation({
      time: incident.resolvedTime.getTime(), 
      text: `✅ RESOLVED: ${incident.alert.alertname} (${this.formatDuration(duration)})`,
      tags: ['incident-resolved']
    });
    
    // Notify Slack
    await this.slack.sendMessage({
      channel: '#incidents',
      text: `✅ *RESOLVED*: ${incident.alert.summary}`,
      attachments: [
        {
          title: 'Resolution Details',
          fields: [
            { title: 'Duration', value: this.formatDuration(duration) },
            { title: 'Resolved by', value: incident.assignee },
            { title: 'Resolution', value: resolution }
          ],
          color: 'good'
        }
      ]
    });
  }
}
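
One thing the class above leaves open is how handleAlert() gets called in the first place. Alertmanager can POST firing alerts to a webhook, so a minimal receiver might look like the sketch below (the /webhooks/alertmanager path, the port, and the stubbed grafanaAPI/slackAPI clients are placeholder assumptions):

// alert-webhook.js - hypothetical wiring: Alertmanager webhook -> IncidentResponse
const express = require('express');
const IncidentResponse = require('./monitoring-runbook'); // assumes the class above is exported from that file

// Placeholder clients; replace with your real Grafana and Slack integrations
const grafanaAPI = { createAnnotation: async () => {} };
const slackAPI = { sendMessage: async () => {} };

const app = express();
app.use(express.json());

const incidentResponse = new IncidentResponse(grafanaAPI, slackAPI);

// Alertmanager sends a JSON body with an `alerts` array; each entry carries its
// labels (alertname, severity, ...) and annotations (summary, description)
app.post('/webhooks/alertmanager', async (req, res) => {
  const alerts = req.body.alerts || [];

  for (const a of alerts) {
    if (a.status !== 'firing') continue; // handle resolutions elsewhere

    await incidentResponse.handleAlert({
      alertname: a.labels.alertname,
      severity: a.labels.severity || 'warning',
      summary: a.annotations.summary,
      description: a.annotations.description
    });
  }

  res.status(200).end();
});

app.listen(5001, () => console.log('Alertmanager webhook listening on :5001'));

On the Alertmanager side this just needs a receiver with a webhook_configs url pointing at this endpoint.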

2. Performance Regression Detection

// regression-detector.js
class PerformanceRegressionDetector {
  constructor(prometheus) {
    this.prometheus = prometheus;
    this.baselines = new Map();
  }
  
  async detectRegressions() {
    const metrics = [
      'http_request_duration_seconds',
      'database_query_duration_seconds', 
      'external_service_request_duration_seconds'
    ];
    
    for (const metric of metrics) {
      await this.checkMetricRegression(metric);
    }
  }
  
  async checkMetricRegression(metric) {
    // These metrics are histograms, so only _sum/_count series exist;
    // derive the mean duration over the current week from those
    const currentWeekQuery = `sum(rate(${metric}_sum[7d])) / sum(rate(${metric}_count[7d]))`;
    const currentWeek = await this.prometheus.query(currentWeekQuery);
    
    // Same query shifted back one week as the baseline
    const previousWeekQuery = `sum(rate(${metric}_sum[7d] offset 7d)) / sum(rate(${metric}_count[7d] offset 7d))`;
    const previousWeek = await this.prometheus.query(previousWeekQuery);
    
    if (currentWeek.length === 0 || previousWeek.length === 0) return;
    
    const currentAvg = parseFloat(currentWeek[0].value[1]);
    const previousAvg = parseFloat(previousWeek[0].value[1]);
    
    const changePercent = ((currentAvg - previousAvg) / previousAvg) * 100;
    
    // Alert if performance degraded by more than 20%
    if (changePercent > 20) {
      await this.reportRegression({
        metric,
        currentValue: currentAvg,
        previousValue: previousAvg,
        changePercent,
        severity: changePercent > 50 ? 'critical' : 'warning'
      });
    }
  }
  
  async reportRegression(regression) {
    console.log(`🔍 Performance regression detected:`);
    console.log(`   Metric: ${regression.metric}`);  
    console.log(`   Current: ${regression.currentValue.toFixed(3)}s`);
    console.log(`   Previous: ${regression.previousValue.toFixed(3)}s`);
    console.log(`   Change: +${regression.changePercent.toFixed(1)}%`);
    
    // Could integrate with alerting system
    // await this.sendAlert(regression);
  }
}
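
The detector assumes a this.prometheus client with a query() method, which isn't shown anywhere in this post. A minimal sketch against the Prometheus HTTP API (/api/v1/query) could be something like this (PROMETHEUS_URL and the hourly schedule are assumptions):

// prometheus-client.js - hypothetical minimal client for the detector above
// An instant query against /api/v1/query returns { data: { result: [...] } },
// where each result item has a `value: [timestamp, "stringValue"]` pair
const axios = require('axios');

const PROMETHEUS_URL = process.env.PROMETHEUS_URL || 'http://localhost:9090';

const prometheusClient = {
  async query(promql) {
    const response = await axios.get(`${PROMETHEUS_URL}/api/v1/query`, {
      params: { query: promql }
    });
    return response.data.data.result; // [] when the query matches nothing
  }
};

module.exports = prometheusClient;

// Usage: run the regression check on a schedule
// const detector = new PerformanceRegressionDetector(prometheusClient);
// setInterval(() => detector.detectRegressions().catch(console.error), 60 * 60 * 1000); // hourly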

3. Capacity Planning Dashboard

// capacity-planner.js
class CapacityPlanner {
  constructor(prometheus, grafana) {
    this.prometheus = prometheus;
    this.grafana = grafana;
  }
  
  async generateCapacityReport() {
    const report = {
      timestamp: new Date(),
      predictions: {},
      recommendations: []
    };
    
    // Predict CPU usage growth
    const cpuGrowth = await this.predictGrowth(
      'avg(100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100))',
      30 // 30 days
    );
    
    report.predictions.cpu = cpuGrowth;
    
    if (cpuGrowth.projectedValue > 80) {
      report.recommendations.push({
        type: 'scale_up',
        resource: 'CPU',
        urgency: 'high',
        description: 'CPU usage projected to exceed 80% in 30 days'
      });
    }
    
    // Predict memory usage growth
    const memoryGrowth = await this.predictGrowth(
      'avg((1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100)',
      30
    );
    
    report.predictions.memory = memoryGrowth;
    
    // Predict request rate growth
    const requestGrowth = await this.predictGrowth(
      'sum(rate(http_requests_total[5m]))',
      30
    );
    
    report.predictions.requestRate = requestGrowth;
    
    // Calculate when we'll need to scale
    const currentCapacity = await this.getCurrentCapacity();
    const daysToCapacity = this.calculateDaysToCapacity(requestGrowth, currentCapacity);
    
    if (daysToCapacity < 60) {
      report.recommendations.push({
        type: 'scale_out',
        resource: 'instances', 
        urgency: daysToCapacity < 30 ? 'high' : 'medium',
        description: `Need additional capacity in ${daysToCapacity} days`
      });
    }
    
    return report;
  }
  
  async predictGrowth(query, days) {
    // Get historical data points
    const dataPoints = await this.prometheus.queryRange(
      query,
      new Date(Date.now() - (days * 24 * 60 * 60 * 1000)),
      new Date(),
      '1d'
    );
    
    if (dataPoints.length < 7) {
      return { error: 'Insufficient data for prediction' };
    }
    
    // Simple linear regression
    const regression = this.linearRegression(dataPoints);
    const projectedValue = regression.slope * days + regression.intercept;
    
    return {
      currentValue: parseFloat(dataPoints[dataPoints.length - 1].value[1]),
      projectedValue,
      growthRate: regression.slope,
      confidence: regression.r2
    };
  }
  
  linearRegression(data) {
    const n = data.length;
    const sumX = data.reduce((sum, point, i) => sum + i, 0);
    const sumY = data.reduce((sum, point) => sum + parseFloat(point.value[1]), 0);
    const sumXY = data.reduce((sum, point, i) => sum + (i * parseFloat(point.value[1])), 0);
    const sumXX = data.reduce((sum, point, i) => sum + (i * i), 0);
    
    const slope = (n * sumXY - sumX * sumY) / (n * sumXX - sumX * sumX);
    const intercept = (sumY - slope * sumX) / n;
    
    // Calculate R-squared
    const yMean = sumY / n;
    const ssRes = data.reduce((sum, point, i) => {
      const predicted = slope * i + intercept;
      const actual = parseFloat(point.value[1]);
      return sum + Math.pow(actual - predicted, 2);
    }, 0);
    
    const ssTot = data.reduce((sum, point) => {
      const actual = parseFloat(point.value[1]);
      return sum + Math.pow(actual - yMean, 2);
    }, 0);
    
    const r2 = 1 - (ssRes / ssTot);
    
    return { slope, intercept, r2 };
  }
}
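
Same story for the capacity planner: it needs a queryRange() helper and someone to call generateCapacityReport(). A hedged sketch of the range-query side and of consuming the report - reshaping each [timestamp, value] pair into the { value: [...] } objects the planner iterates over is my guess at the intended client shape:

// capacity-report.js - hypothetical glue code for CapacityPlanner
const axios = require('axios');

const PROMETHEUS_URL = process.env.PROMETHEUS_URL || 'http://localhost:9090';

const prometheusRangeClient = {
  // /api/v1/query_range returns series with a `values` array of [timestamp, "value"] pairs;
  // the planner reads point.value[1], so wrap each pair as { value: pair }
  async queryRange(promql, start, end, step) {
    const response = await axios.get(`${PROMETHEUS_URL}/api/v1/query_range`, {
      params: {
        query: promql,
        start: start.getTime() / 1000,
        end: end.getTime() / 1000,
        step
      }
    });
    const series = response.data.data.result[0];
    return series ? series.values.map((pair) => ({ value: pair })) : [];
  }
};

// Example: generate a report and print the recommendations
// const planner = new CapacityPlanner(prometheusRangeClient, grafanaClient);
// const report = await planner.generateCapacityReport();
// report.recommendations.forEach((rec) =>
//   console.log(`[${rec.urgency}] ${rec.resource}: ${rec.description}`));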

A Real Case: Tracking Down a Performance Problem with Monitoring

The Problem: Random Slow Responses

About two weeks after setting up Grafana + Prometheus, the old problem came back: randomly slow response times.

But this time there was data to look at! 📊

Analysis steps (the queries behind these observations are sketched after the list):

  1. Look at the Response Time Dashboard:

    • Normal 95th percentile: ~200ms
    • During slow periods: spikes to 8-10 seconds
    • Pattern: it happened every 15-20 minutes
  2. Look at the Database Metrics:

    • Query duration spikes lined up exactly with the response time spikes
    • Connection pool: free connections dropped to 0
    • Slow query: one query was taking >5 seconds
  3. Check the Business Metrics:

    • Order processing time spiked at the same moments
    • External service metrics: payment gateway timeouts
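
Roughly the queries behind those three observations, reconstructed from the metrics defined earlier in this post (a sketch - the exact panel queries may differ):

// investigation-queries.js - the PromQL used to correlate the spikes (sketch)
const investigationQueries = {
  // 1) Did the API p95 really spike?
  apiP95:
    'histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))',

  // 2) Do database queries spike at the same timestamps, and on which tables?
  dbP95ByTable:
    'histogram_quantile(0.95, sum(rate(database_query_duration_seconds_bucket[5m])) by (table, le))',

  // 2b) Does the connection pool run dry at those moments?
  freePoolConnections: 'mysql_pool_connections{state="free"}',

  // 3) Is order processing (especially the payment step) affected too?
  orderP95ByStep:
    'histogram_quantile(0.95, sum(rate(order_processing_seconds_bucket[5m])) by (step, le))'
};

module.exports = investigationQueries;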

The root cause:

-- The offending query (no suitable index)
SELECT p.*, COUNT(o.id) as order_count 
FROM products p 
LEFT JOIN orders o ON p.id = o.product_id 
WHERE o.created_at >= NOW() - INTERVAL 1 HOUR 
GROUP BY p.id 
ORDER BY order_count DESC 
LIMIT 10;

The fixes:

  1. Add an index:

    CREATE INDEX idx_orders_created_product ON orders(created_at, product_id);
    
  2. Cache the query results (a sketch of the assumed getOrSet helper follows this list):

    // Cache popular products query
    const popularProducts = await cache.getOrSet(
      'popular_products_hourly',
      () => db.getPopularProducts(),
      900 // 15 minutes
    );
    
  3. Set query timeout:

    // Add query timeout
    const results = await Promise.race([
      db.query(sql, params),
      new Promise((_, reject) => 
        setTimeout(() => reject(new Error('Query timeout')), 5000)
      )
    ]);
    
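The cache.getOrSet helper in fix 2 isn't defined anywhere in this post; a minimal in-memory sketch (in production this would more likely live in Redis) could be:

// cache.js - hypothetical getOrSet helper used in fix 2
const store = new Map();

async function getOrSet(key, producer, ttlSeconds) {
  const cached = store.get(key);
  if (cached && cached.expiresAt > Date.now()) {
    return cached.value; // still fresh, skip the expensive query
  }

  const value = await producer(); // e.g. () => db.getPopularProducts()
  store.set(key, { value, expiresAt: Date.now() + ttlSeconds * 1000 });
  return value;
}

module.exports = { getOrSet };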

Results after the fix:

  • Response time stable at ~50ms (a 4x improvement)
  • Database query time consistently <100ms
  • Connection pool utilization <50%
  • Zero timeouts

The big lesson: without monitoring, there is simply no way to know where the problem is! 🎯

Conclusion: Monitoring That Changed My Life as a Developer

Before proper monitoring:

  • Debugging by guesswork 🎲
  • Never knowing the root cause of a problem
  • Reactive instead of proactive
  • Only finding out about performance issues when users complained

After Grafana + Prometheus:

  • Data-driven decisions - every decision backed by numbers
  • Proactive monitoring - knowing about problems before users do
  • Root cause analysis - enough data to actually find the cause
  • Capacity planning - able to plan ahead

Real benefits:

  • MTTR (Mean Time To Recovery) down by 80%
  • Incident prevention - stopping problems before they happen
  • Performance optimization backed by evidence
  • Business insight - real understanding of user behavior

Best practices learned:

  • Monitor everything, but alert only on what matters
  • SLI/SLO approach - define clear Service Level Objectives (a small error-budget sketch follows this list)
  • Runbooks - write down how to fix problems before they happen
  • Dashboard hygiene - keep dashboards tidy and organized
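
To make the SLI/SLO point concrete, here is a small error-budget calculation - the 99.9% target and the 30-day window are example numbers, not something from my setup:

// error-budget.js - sketch: how much of a 30-day error budget have we burned?
const SLO_TARGET = 0.999; // example: 99.9% of requests should succeed over 30 days

// totalRequests / failedRequests would come from Prometheus, e.g.
//   sum(increase(http_requests_total[30d]))
//   sum(increase(http_requests_total{status=~"5.."}[30d]))
function errorBudgetStatus(totalRequests, failedRequests) {
  const allowedFailures = totalRequests * (1 - SLO_TARGET); // the error budget
  const budgetUsed = allowedFailures === 0 ? 1 : failedRequests / allowedFailures;

  return {
    allowedFailures: Math.floor(allowedFailures),
    failedRequests,
    budgetUsedPercent: +(budgetUsed * 100).toFixed(1),
    exhausted: failedRequests >= allowedFailures
  };
}

// Example: 10M requests with 4,200 failures -> 42% of the budget used
console.log(errorBudgetStatus(10_000_000, 4200));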

Anti-patterns to avoid:

  • Alert fatigue (too many alerts)
  • Vanity metrics (metrics that look good but aren't useful)
  • Over-engineering the monitoring setup
  • Not training the team to actually use the tools

Prometheus + Grafana are like a microscope for your system

They let you see what was invisible before and understand the complex behavior of your application.

These days I can't imagine working without monitoring!

Because it changed the way I think, from "thinking I know" to "actually knowing"! 📈💡