article

OpenTelemetry - ตาและหูของระบบ Microservices

16 min read

OpenTelemetry คืออะไร?

OpenTelemetry (OTel) เป็น Observability Framework แบบ Open-source ที่ช่วยให้เราเก็บ ประมวลผล และส่งออก Telemetry Data (Traces, Metrics, Logs) จากแอปพลิเคชัน

The Three Pillars of Observability

graph LR
    A[Traces] --> D[Complete Observability]
    B[Metrics] --> D
    C[Logs] --> D
    
    A --> A1["What happened?<br/>Request flow<br/>Performance bottlenecks"]
    B --> B1["How much?<br/>System health<br/>Resource usage"]
    C --> C1["Why did it happen?<br/>Error details<br/>Context information"]

Core Concepts

1. Traces และ Spans

Trace คือการติดตาม Request ตั้งแต่เริ่มต้นจนจบ
Span คือหน่วยงานย่อยภายใน Trace

// Node.js Example
const { trace } = require('@opentelemetry/api');

async function processOrder(orderId) {
  // Create a span
  const span = trace.getActiveTracer().startSpan('process-order');
  
  try {
    // Add attributes to span
    span.setAttributes({
      'order.id': orderId,
      'user.id': getCurrentUserId(),
      'service.version': process.env.SERVICE_VERSION
    });

    // Child operations create child spans automatically
    const user = await getUserById(span.getSpanContext(), userId);
    const inventory = await checkInventory(orderId);
    const payment = await processPayment(orderId, user.paymentMethod);
    
    // Add events to span
    span.addEvent('order-validated', {
      'validation.result': 'success',
      'inventory.available': inventory.quantity
    });

    span.setStatus({ code: SpanStatusCode.OK });
    return { success: true, orderId };
    
  } catch (error) {
    // Record exceptions
    span.recordException(error);
    span.setStatus({ 
      code: SpanStatusCode.ERROR, 
      message: error.message 
    });
    throw error;
  } finally {
    span.end();
  }
}

2. Metrics

Metrics คือข้อมูลตัวเลขที่วัดได้ เช่น Counter, Gauge, Histogram

const { metrics } = require('@opentelemetry/api');

class OrderMetrics {
  constructor() {
    const meter = metrics.getMeter('order-service', '1.0.0');
    
    // Counter - จำนวนที่เพิ่มขึ้นเรื่อยๆ
    this.ordersTotal = meter.createCounter('orders_total', {
      description: 'Total number of orders processed'
    });
    
    // Gauge - ค่าที่เปลี่ยนแปลงได้
    this.activeConnections = meter.createUpDownCounter('active_connections', {
      description: 'Number of active database connections'
    });
    
    // Histogram - การแจกแจงของค่า
    this.requestDuration = meter.createHistogram('request_duration_ms', {
      description: 'Request duration in milliseconds',
      unit: 'ms'
    });
  }

  recordOrderCreated(customerId, region, orderType) {
    this.ordersTotal.add(1, {
      customer_id: customerId,
      region: region,
      order_type: orderType,
      status: 'created'
    });
  }

  recordRequestDuration(duration, endpoint, method, statusCode) {
    this.requestDuration.record(duration, {
      endpoint: endpoint,
      method: method,
      status_code: statusCode.toString()
    });
  }
}

// การใช้งาน
const orderMetrics = new OrderMetrics();

app.use((req, res, next) => {
  const startTime = Date.now();
  
  res.on('finish', () => {
    const duration = Date.now() - startTime;
    orderMetrics.recordRequestDuration(
      duration,
      req.route?.path || req.path,
      req.method,
      res.statusCode
    );
  });
  
  next();
});

3. Logs (Structured Logging)

const { logs } = require('@opentelemetry/api');
const winston = require('winston');

class StructuredLogger {
  constructor() {
    this.logger = winston.createLogger({
      level: 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.errors({ stack: true }),
        winston.format.json()
      ),
      transports: [
        new winston.transports.Console(),
        new winston.transports.File({ filename: 'app.log' })
      ]
    });
  }

  logWithTrace(level, message, metadata = {}) {
    const span = trace.getActiveSpan();
    const spanContext = span?.spanContext();
    
    this.logger.log(level, message, {
      ...metadata,
      traceId: spanContext?.traceId,
      spanId: spanContext?.spanId,
      traceFlags: spanContext?.traceFlags,
      timestamp: new Date().toISOString(),
      service: 'order-service',
      version: process.env.SERVICE_VERSION
    });
  }

  info(message, metadata) {
    this.logWithTrace('info', message, metadata);
  }

  error(message, error, metadata = {}) {
    this.logWithTrace('error', message, {
      ...metadata,
      error: {
        name: error.name,
        message: error.message,
        stack: error.stack
      }
    });
  }
}

// การใช้งาน
const logger = new StructuredLogger();

async function processPayment(orderId, paymentMethod) {
  const span = trace.getActiveTracer().startSpan('process-payment');
  
  try {
    logger.info('Processing payment', {
      orderId,
      paymentMethod: paymentMethod.type,
      amount: paymentMethod.amount
    });

    const result = await paymentGateway.charge(paymentMethod);
    
    logger.info('Payment processed successfully', {
      orderId,
      transactionId: result.transactionId,
      amount: result.amount
    });
    
    return result;
  } catch (error) {
    logger.error('Payment processing failed', error, {
      orderId,
      paymentMethod: paymentMethod.type
    });
    throw error;
  } finally {
    span.end();
  }
}

OpenTelemetry Setup

Node.js Complete Setup

// tracing.js
const { NodeSDK } = require('@opentelemetry/sdk-node');
const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node');
const { Resource } = require('@opentelemetry/resources');
const { SemanticResourceAttributes } = require('@opentelemetry/semantic-conventions');

// Exporters
const { JaegerExporter } = require('@opentelemetry/exporter-jaeger');
const { PrometheusExporter } = require('@opentelemetry/exporter-prometheus');
const { OTLPTraceExporter } = require('@opentelemetry/exporter-otlp-http');

// Create resource with service information
const resource = new Resource({
  [SemanticResourceAttributes.SERVICE_NAME]: 'order-service',
  [SemanticResourceAttributes.SERVICE_VERSION]: '1.0.0',
  [SemanticResourceAttributes.SERVICE_INSTANCE_ID]: process.env.HOSTNAME || 'localhost',
  [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV || 'development'
});

// Configure trace exporter
const traceExporter = process.env.NODE_ENV === 'production' 
  ? new OTLPTraceExporter({
      url: 'https://api.honeycomb.io/v1/traces',
      headers: {
        'x-honeycomb-team': process.env.HONEYCOMB_API_KEY
      }
    })
  : new JaegerExporter({
      endpoint: 'http://localhost:14268/api/traces'
    });

// Configure metrics exporter
const metricsExporter = new PrometheusExporter({
  port: 9464
});

// Initialize SDK
const sdk = new NodeSDK({
  resource,
  traceExporter,
  metricExporter: metricsExporter,
  instrumentations: [
    getNodeAutoInstrumentations({
      '@opentelemetry/instrumentation-fs': {
        enabled: false // ปิด file system instrumentation เพื่อลด noise
      },
      '@opentelemetry/instrumentation-http': {
        enabled: true,
        requestHook: (span, request) => {
          span.setAttributes({
            'http.request_id': request.headers['x-request-id'],
            'user.id': request.headers['x-user-id']
          });
        },
        responseHook: (span, response) => {
          span.setAttributes({
            'http.response.size': response.headers['content-length']
          });
        }
      },
      '@opentelemetry/instrumentation-express': {
        enabled: true
      },
      '@opentelemetry/instrumentation-mongodb': {
        enabled: true,
        enhancedDatabaseReporting: true
      },
      '@opentelemetry/instrumentation-redis': {
        enabled: true,
        dbStatementSerializer: (cmdName, cmdArgs) => {
          return `${cmdName} ${cmdArgs[0]}`; // แสดงเฉพาะ command และ key แรก
        }
      }
    })
  ]
});

// Start tracing before importing application
sdk.start();

// Graceful shutdown
process.on('SIGTERM', () => {
  sdk.shutdown()
    .then(() => console.log('Telemetry terminated'))
    .catch((error) => console.log('Error terminating telemetry', error))
    .finally(() => process.exit(0));
});

module.exports = sdk;
// app.js
// Import tracing FIRST, before any other modules
require('./tracing');

const express = require('express');
const { trace, context, propagation } = require('@opentelemetry/api');

const app = express();

// Custom instrumentation middleware
app.use((req, res, next) => {
  const span = trace.getActiveSpan();
  
  // Extract trace context from incoming headers
  const parentContext = propagation.extract(context.active(), req.headers);
  
  // Add custom attributes
  if (span) {
    span.setAttributes({
      'http.request.id': req.headers['x-request-id'] || generateRequestId(),
      'user.id': req.headers['x-user-id'],
      'http.user_agent': req.headers['user-agent'],
      'http.real_ip': req.headers['x-real-ip'] || req.ip
    });
  }
  
  next();
});

// Health check endpoint
app.get('/health', (req, res) => {
  const span = trace.getActiveSpan();
  span?.addEvent('health-check-requested');
  
  res.json({ 
    status: 'healthy', 
    timestamp: new Date().toISOString(),
    service: 'order-service',
    version: process.env.SERVICE_VERSION 
  });
});

// Business logic endpoints
app.post('/orders', async (req, res) => {
  const tracer = trace.getActiveTracer();
  const span = tracer.startSpan('create-order-endpoint');
  
  try {
    const orderData = req.body;
    
    // Validate input
    if (!orderData.customerId || !orderData.items) {
      span.setStatus({ code: SpanStatusCode.ERROR, message: 'Invalid input' });
      return res.status(400).json({ error: 'Invalid order data' });
    }

    // Process order (this will create child spans automatically)
    const result = await processOrder(orderData);
    
    span.setAttributes({
      'order.id': result.orderId,
      'order.items.count': orderData.items.length,
      'order.total.amount': result.totalAmount
    });
    
    res.status(201).json(result);
  } catch (error) {
    span.recordException(error);
    span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
    res.status(500).json({ error: 'Internal server error' });
  } finally {
    span.end();
  }
});

function generateRequestId() {
  return Math.random().toString(36).substring(2) + Date.now().toString(36);
}

const PORT = process.env.PORT || 3000;
app.listen(PORT, () => {
  console.log(`Server running on port ${PORT}`);
});

Python (FastAPI) Setup

# tracing.py
from opentelemetry import trace, metrics
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.exporter.prometheus import PrometheusMetricReader
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentator
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor

# Configure resource
resource = Resource.create({
    SERVICE_NAME: "order-service-python",
    SERVICE_VERSION: "1.0.0"
})

# Setup tracing
trace.set_tracer_provider(TracerProvider(resource=resource))
tracer = trace.get_tracer(__name__)

# Setup Jaeger exporter
jaeger_exporter = JaegerExporter(
    agent_host_name="localhost",
    agent_port=6831,
)

# Add span processor
span_processor = BatchSpanProcessor(jaeger_exporter)
trace.get_tracer_provider().add_span_processor(span_processor)

# Setup metrics
prometheus_reader = PrometheusMetricReader(port=8000)
metrics.set_meter_provider(MeterProvider(
    resource=resource,
    metric_readers=[prometheus_reader]
))

meter = metrics.get_meter(__name__)

# Auto-instrumentation
RequestsInstrumentor().instrument()
SQLAlchemyInstrumentor().instrument()
RedisInstrumentor().instrument()

# Custom metrics
order_counter = meter.create_counter(
    name="orders_created_total",
    description="Total number of orders created"
)

request_duration = meter.create_histogram(
    name="http_request_duration_seconds",
    description="HTTP request duration in seconds"
)
# main.py
from fastapi import FastAPI, HTTPException, Depends
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentator
from opentelemetry import trace
import logging
import time
from typing import Dict, Any
import asyncio

# Import tracing setup
import tracing

app = FastAPI(title="Order Service", version="1.0.0")

# Auto-instrument FastAPI
FastAPIInstrumentor.instrument_app(app)

# Get tracer
tracer = trace.get_tracer(__name__)

# Structured logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class OrderService:
    def __init__(self):
        self.orders_db = {}  # Mock database
    
    async def create_order(self, order_data: Dict[str, Any]) -> Dict[str, Any]:
        # Create a span for this operation
        with tracer.start_as_current_span("create_order") as span:
            # Add span attributes
            span.set_attributes({
                "order.customer_id": order_data.get("customer_id"),
                "order.items_count": len(order_data.get("items", [])),
                "operation": "create_order"
            })
            
            # Simulate processing time
            await asyncio.sleep(0.1)
            
            # Validate order
            if not order_data.get("customer_id"):
                span.record_exception(ValueError("Customer ID is required"))
                raise HTTPException(status_code=400, detail="Customer ID is required")
            
            # Create order
            order_id = f"order_{int(time.time())}"
            order = {
                "id": order_id,
                "customer_id": order_data["customer_id"],
                "items": order_data.get("items", []),
                "status": "created",
                "created_at": time.time()
            }
            
            # Save to "database"
            self.orders_db[order_id] = order
            
            # Add event to span
            span.add_event("order_created", {
                "order.id": order_id,
                "order.status": "created"
            })
            
            # Record metrics
            tracing.order_counter.add(1, {
                "customer_id": order_data["customer_id"],
                "status": "created"
            })
            
            logger.info("Order created successfully", extra={
                "order_id": order_id,
                "customer_id": order_data["customer_id"],
                "trace_id": format(span.get_span_context().trace_id, "032x"),
                "span_id": format(span.get_span_context().span_id, "016x")
            })
            
            return order

order_service = OrderService()

@app.middleware("http")
async def add_process_time_header(request, call_next):
    start_time = time.time()
    response = await call_next(request)
    process_time = time.time() - start_time
    
    # Record request duration metric
    tracing.request_duration.record(process_time, {
        "method": request.method,
        "endpoint": str(request.url.path),
        "status_code": str(response.status_code)
    })
    
    response.headers["X-Process-Time"] = str(process_time)
    return response

@app.get("/health")
async def health_check():
    with tracer.start_as_current_span("health_check") as span:
        span.add_event("health_check_requested")
        return {
            "status": "healthy",
            "service": "order-service-python",
            "version": "1.0.0"
        }

@app.post("/orders")
async def create_order(order_data: Dict[str, Any]):
    return await order_service.create_order(order_data)

@app.get("/orders/{order_id}")
async def get_order(order_id: str):
    with tracer.start_as_current_span("get_order") as span:
        span.set_attributes({
            "order.id": order_id,
            "operation": "get_order"
        })
        
        order = order_service.orders_db.get(order_id)
        if not order:
            span.record_exception(HTTPException(status_code=404, detail="Order not found"))
            raise HTTPException(status_code=404, detail="Order not found")
        
        return order

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

Distributed Tracing

Cross-Service Communication

// Service A (Order Service)
const axios = require('axios');
const { trace, context, propagation } = require('@opentelemetry/api');

async function callInventoryService(orderId, items) {
  const span = trace.getActiveTracer().startSpan('call-inventory-service');
  
  try {
    // Inject trace context into HTTP headers
    const headers = {};
    propagation.inject(context.active(), headers);
    
    const response = await axios.post('http://inventory-service:3001/check', {
      orderId,
      items
    }, { headers });
    
    span.setAttributes({
      'http.method': 'POST',
      'http.url': 'http://inventory-service:3001/check',
      'http.status_code': response.status,
      'inventory.items_count': items.length
    });
    
    return response.data;
  } catch (error) {
    span.recordException(error);
    throw error;
  } finally {
    span.end();
  }
}
// Service B (Inventory Service)
const express = require('express');
const { trace, context, propagation } = require('@opentelemetry/api');

app.post('/check', (req, res) => {
  // Extract trace context from incoming headers
  const parentContext = propagation.extract(context.active(), req.headers);
  
  // Start a new span with the extracted context as parent
  const span = trace.getActiveTracer().startSpan('check-inventory', {}, parentContext);
  
  context.with(context.setSpan(parentContext, span), async () => {
    try {
      const { orderId, items } = req.body;
      
      span.setAttributes({
        'inventory.order_id': orderId,
        'inventory.items_count': items.length
      });

      // Simulate inventory check
      const availability = await checkItemsAvailability(items);
      
      span.addEvent('inventory-checked', {
        'result': availability.allAvailable ? 'available' : 'unavailable',
        'available_items': availability.availableCount
      });
      
      res.json(availability);
    } catch (error) {
      span.recordException(error);
      res.status(500).json({ error: error.message });
    } finally {
      span.end();
    }
  });
});

Database Instrumentation

const { MongoClient } = require('mongodb');
const { trace } = require('@opentelemetry/api');

class OrderRepository {
  constructor(mongoUrl) {
    this.client = new MongoClient(mongoUrl);
    this.db = null;
  }

  async connect() {
    await this.client.connect();
    this.db = this.client.db('orders');
  }

  async createOrder(orderData) {
    const span = trace.getActiveTracer().startSpan('db-create-order');
    
    try {
      span.setAttributes({
        'db.operation': 'create',
        'db.collection': 'orders',
        'order.customer_id': orderData.customerId
      });

      // MongoDB instrumentation จะสร้าง child spans อัตโนมัติ
      const result = await this.db.collection('orders').insertOne({
        ...orderData,
        createdAt: new Date(),
        updatedAt: new Date()
      });
      
      span.setAttributes({
        'db.inserted_id': result.insertedId.toString(),
        'db.acknowledged': result.acknowledged
      });
      
      return result;
    } catch (error) {
      span.recordException(error);
      throw error;
    } finally {
      span.end();
    }
  }

  async findOrderById(orderId) {
    const span = trace.getActiveTracer().startSpan('db-find-order');
    
    try {
      span.setAttributes({
        'db.operation': 'findOne',
        'db.collection': 'orders',
        'order.id': orderId
      });

      const order = await this.db.collection('orders').findOne({
        _id: new ObjectId(orderId)
      });
      
      span.setAttributes({
        'db.found': order !== null
      });
      
      return order;
    } catch (error) {
      span.recordException(error);
      throw error;
    } finally {
      span.end();
    }
  }
}

Custom Instrumentation

Business Logic Tracing

class PaymentProcessor {
  constructor() {
    this.tracer = trace.getTracer('payment-processor', '1.0.0');
  }

  async processPayment(orderId, paymentData) {
    return this.tracer.startActiveSpan('process-payment', async (span) => {
      try {
        span.setAttributes({
          'payment.order_id': orderId,
          'payment.method': paymentData.method,
          'payment.amount': paymentData.amount,
          'payment.currency': paymentData.currency
        });

        // Step 1: Validate payment data
        await this.validatePaymentData(paymentData);
        
        // Step 2: Check fraud
        const fraudCheck = await this.checkFraud(paymentData);
        span.addEvent('fraud-check-completed', {
          'fraud.score': fraudCheck.score,
          'fraud.approved': fraudCheck.approved
        });
        
        if (!fraudCheck.approved) {
          throw new Error('Payment rejected due to fraud detection');
        }
        
        // Step 3: Charge payment method
        const chargeResult = await this.chargePaymentMethod(paymentData);
        
        // Step 4: Record transaction
        const transaction = await this.recordTransaction(orderId, chargeResult);
        
        span.setAttributes({
          'payment.transaction_id': transaction.id,
          'payment.status': 'completed'
        });
        
        span.setStatus({ code: SpanStatusCode.OK });
        return transaction;
        
      } catch (error) {
        span.recordException(error);
        span.setStatus({ 
          code: SpanStatusCode.ERROR, 
          message: error.message 
        });
        throw error;
      }
    });
  }

  async validatePaymentData(paymentData) {
    return this.tracer.startActiveSpan('validate-payment-data', async (span) => {
      // Validation logic with detailed tracing
      span.setAttributes({
        'validation.method': paymentData.method,
        'validation.has_card_number': !!paymentData.cardNumber,
        'validation.has_cvv': !!paymentData.cvv
      });
      
      if (!paymentData.cardNumber) {
        throw new Error('Card number is required');
      }
      
      // More validation...
      span.addEvent('validation-completed', { 'result': 'valid' });
    });
  }

  async checkFraud(paymentData) {
    return this.tracer.startActiveSpan('fraud-check', async (span) => {
      span.setAttributes({
        'fraud.check_type': 'ml_model',
        'fraud.model_version': '2.1.0'
      });
      
      // Simulate ML model call
      const score = Math.random();
      const approved = score < 0.8;
      
      span.setAttributes({
        'fraud.score': score,
        'fraud.threshold': 0.8,
        'fraud.approved': approved
      });
      
      return { score, approved };
    });
  }
}

Performance Monitoring

class PerformanceTracker {
  constructor() {
    this.meter = metrics.getMeter('performance-tracker', '1.0.0');
    
    this.responseTimeHistogram = this.meter.createHistogram('http_request_duration_ms', {
      description: 'HTTP request duration in milliseconds'
    });
    
    this.activeRequestsGauge = this.meter.createUpDownCounter('http_active_requests', {
      description: 'Number of active HTTP requests'
    });
    
    this.errorCounter = this.meter.createCounter('http_errors_total', {
      description: 'Total number of HTTP errors'
    });
  }

  trackRequest(req, res, next) {
    const startTime = Date.now();
    
    // Increment active requests
    this.activeRequestsGauge.add(1, {
      method: req.method,
      route: req.route?.path || req.path
    });

    // Track when request finishes
    res.on('finish', () => {
      const duration = Date.now() - startTime;
      const labels = {
        method: req.method,
        route: req.route?.path || req.path,
        status_code: res.statusCode.toString(),
        status_class: `${Math.floor(res.statusCode / 100)}xx`
      };

      // Record response time
      this.responseTimeHistogram.record(duration, labels);
      
      // Decrement active requests
      this.activeRequestsGauge.add(-1, {
        method: req.method,
        route: req.route?.path || req.path
      });

      // Count errors
      if (res.statusCode >= 400) {
        this.errorCounter.add(1, {
          ...labels,
          error_type: res.statusCode >= 500 ? 'server_error' : 'client_error'
        });
      }

      // Add trace attributes for performance
      const span = trace.getActiveSpan();
      if (span) {
        span.setAttributes({
          'http.response_time_ms': duration,
          'performance.slow_request': duration > 1000
        });
        
        if (duration > 5000) {
          span.addEvent('slow-request-detected', {
            'threshold_ms': 5000,
            'actual_ms': duration
          });
        }
      }
    });

    next();
  }
}

// การใช้งาน
const performanceTracker = new PerformanceTracker();
app.use(performanceTracker.trackRequest.bind(performanceTracker));

Monitoring Stack Setup

Docker Compose for Observability Stack

# docker-compose.observability.yml
version: '3.8'

services:
  # Jaeger - Distributed Tracing
  jaeger:
    image: jaegertracing/all-in-one:1.50
    ports:
      - "16686:16686"  # Jaeger UI
      - "14268:14268"  # Jaeger collector HTTP
      - "6831:6831/udp"  # Jaeger agent UDP
    environment:
      - COLLECTOR_OTLP_ENABLED=true
    networks:
      - observability

  # Prometheus - Metrics Collection
  prometheus:
    image: prom/prometheus:v2.45.0
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'
    networks:
      - observability

  # Grafana - Visualization
  grafana:
    image: grafana/grafana:10.1.0
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin123
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    networks:
      - observability

  # Elasticsearch - Log Storage
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.9.0
    ports:
      - "9200:9200"
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
    volumes:
      - elasticsearch_data:/usr/share/elasticsearch/data
    networks:
      - observability

  # Kibana - Log Visualization
  kibana:
    image: docker.elastic.co/kibana/kibana:8.9.0
    ports:
      - "5601:5601"
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
    depends_on:
      - elasticsearch
    networks:
      - observability

  # Logstash - Log Processing
  logstash:
    image: docker.elastic.co/logstash/logstash:8.9.0
    ports:
      - "5044:5044"
      - "12201:12201/udp"
    volumes:
      - ./logstash/pipeline:/usr/share/logstash/pipeline
      - ./logstash/config:/usr/share/logstash/config
    depends_on:
      - elasticsearch
    networks:
      - observability

volumes:
  prometheus_data:
  grafana_data:
  elasticsearch_data:

networks:
  observability:
    driver: bridge

Prometheus Configuration

# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "alert_rules.yml"

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'order-service'
    static_configs:
      - targets: ['order-service:9464']
    metrics_path: /metrics
    scrape_interval: 10s

  - job_name: 'inventory-service'
    static_configs:
      - targets: ['inventory-service:9464']

  - job_name: 'payment-service'
    static_configs:
      - targets: ['payment-service:9464']

  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']

alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - alertmanager:9093

Grafana Dashboard

{
  "dashboard": {
    "title": "Microservices Observability",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_requests_total[5m])",
            "legendFormat": "{{service}} - {{method}} {{route}}"
          }
        ]
      },
      {
        "title": "Response Time (95th percentile)",
        "type": "graph", 
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(http_request_duration_ms_bucket[5m]))",
            "legendFormat": "{{service}} - 95th percentile"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_errors_total[5m]) / rate(http_requests_total[5m])",
            "legendFormat": "{{service}} - Error Rate"
          }
        ]
      },
      {
        "title": "Service Map",
        "type": "nodeGraph",
        "targets": [
          {
            "expr": "sum(rate(traces_spanmetrics_calls_total[5m])) by (service_name, span_name)"
          }
        ]
      }
    ]
  }
}

Best Practices

1. Sampling Strategy

// Intelligent sampling
const { TraceIdRatioBasedSampler, ParentBasedSampler } = require('@opentelemetry/sdk-trace-base');

class IntelligentSampler {
  constructor() {
    this.errorSampler = new AlwaysOnSampler(); // Always sample errors
    this.slowRequestSampler = new AlwaysOnSampler(); // Always sample slow requests
    this.normalSampler = new TraceIdRatioBasedSampler(0.1); // 10% sampling for normal requests
  }

  shouldSample(context, traceId, spanName, spanKind, attributes, links) {
    // Always sample if there's an error
    if (attributes['error'] || attributes['http.status_code'] >= 400) {
      return this.errorSampler.shouldSample(context, traceId, spanName, spanKind, attributes, links);
    }
    
    // Always sample slow requests
    if (attributes['http.response_time_ms'] > 1000) {
      return this.slowRequestSampler.shouldSample(context, traceId, spanName, spanKind, attributes, links);
    }
    
    // For critical endpoints, use higher sampling rate
    const criticalEndpoints = ['/orders', '/payments', '/checkout'];
    if (criticalEndpoints.some(endpoint => spanName.includes(endpoint))) {
      return new TraceIdRatioBasedSampler(0.5).shouldSample(context, traceId, spanName, spanKind, attributes, links);
    }
    
    // Default sampling
    return this.normalSampler.shouldSample(context, traceId, spanName, spanKind, attributes, links);
  }
}

2. Context Propagation

// Custom context propagation
class BusinessContextPropagator {
  inject(context, carrier, setter) {
    const span = trace.getSpanFromContext(context);
    if (span) {
      // Propagate business context
      setter.set(carrier, 'x-user-id', span.getAttribute('user.id'));
      setter.set(carrier, 'x-tenant-id', span.getAttribute('tenant.id'));
      setter.set(carrier, 'x-correlation-id', span.getAttribute('correlation.id'));
    }
  }

  extract(context, carrier, getter) {
    const userId = getter.get(carrier, 'x-user-id');
    const tenantId = getter.get(carrier, 'x-tenant-id'); 
    const correlationId = getter.get(carrier, 'x-correlation-id');
    
    // Add to current span
    const span = trace.getActiveSpan();
    if (span && userId) {
      span.setAttributes({
        'user.id': userId,
        'tenant.id': tenantId,
        'correlation.id': correlationId
      });
    }
    
    return context;
  }

  fields() {
    return ['x-user-id', 'x-tenant-id', 'x-correlation-id'];
  }
}

// Register the propagator
propagation.setGlobalPropagator(
  new CompositePropagator({
    propagators: [
      new TraceContextPropagator(),
      new BusinessContextPropagator()
    ]
  })
);

3. Alert Rules

# alert_rules.yml
groups:
  - name: microservices_alerts
    rules:
      - alert: HighErrorRate
        expr: rate(http_errors_total[5m]) / rate(http_requests_total[5m]) > 0.1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for service {{ $labels.service }}"

      - alert: HighResponseTime
        expr: histogram_quantile(0.95, rate(http_request_duration_ms_bucket[5m])) > 2000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High response time detected"
          description: "95th percentile response time is {{ $value }}ms for service {{ $labels.service }}"

      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "Service {{ $labels.instance }} has been down for more than 1 minute"

สรุป

OpenTelemetry เป็นเครื่องมือที่ทรงพลังสำหรับการสร้าง Observability ใน Microservices:

  1. Unified Observability: รวม Traces, Metrics และ Logs ไว้ในที่เดียว
  2. Vendor Agnostic: ไม่ผูกติดกับ Monitoring Tool ใดๆ
  3. Auto-Instrumentation: ติดตั้งง่าย ได้ข้อมูลทันที
  4. Custom Instrumentation: ปรับแต่งได้ตามความต้องการ
  5. Distributed Tracing: ติดตาม Request ข้าม Services ได้
  6. Performance Monitoring: วัดประสิทธิภาพแบบ Real-time
  7. Alerting: แจ้งเตือนเมื่อมีปัญหา

การใช้ OpenTelemetry ช่วยให้เราเข้าใจระบบได้ลึกขึ้น แก้ปัญหาได้เร็วขึ้น และปรับปรุงประสิทธิภาพได้อย่างมีเป้าหมาย ทำให้ระบบ Microservices มีความเสถียรและน่าเชื่อถือมากขึ้น!