What is OpenTelemetry?
OpenTelemetry (OTel) is an open-source observability framework that lets us collect, process, and export telemetry data (traces, metrics, logs) from our applications.
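In code, all three signals are reached through the @opentelemetry/api package. A quick orientation of the entry points used throughout this post (the instrumentation scope name 'order-service' is just an example):

const { trace, metrics, context, propagation } = require('@opentelemetry/api');

// Traces: get a tracer for your instrumentation scope, then create spans from it
const tracer = trace.getTracer('order-service', '1.0.0');

// Metrics: get a meter, then create counters, histograms, etc. from it
const meter = metrics.getMeter('order-service', '1.0.0');

// Logs: typically emitted through your existing logger (e.g. winston) and
// correlated with traces via the active span's traceId/spanId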
The Three Pillars of Observability
graph LR
A[Traces] --> D[Complete Observability]
B[Metrics] --> D
C[Logs] --> D
A --> A1["What happened?<br/>Request flow<br/>Performance bottlenecks"]
B --> B1["How much?<br/>System health<br/>Resource usage"]
C --> C1["Why did it happen?<br/>Error details<br/>Context information"]
Core Concepts
1. Traces and Spans
A Trace follows a single request end to end, from the moment it enters the system until it completes.
A Span is a single unit of work within a Trace.
// Node.js Example
const { trace, SpanStatusCode } = require('@opentelemetry/api');
async function processOrder(orderId) {
// Create a span
const span = trace.getTracer('order-service').startSpan('process-order');
try {
// Add attributes to span
span.setAttributes({
'order.id': orderId,
'user.id': getCurrentUserId(),
'service.version': process.env.SERVICE_VERSION
});
// Child operations create child spans when they run in this span's context
const user = await getUserById(getCurrentUserId());
const inventory = await checkInventory(orderId);
const payment = await processPayment(orderId, user.paymentMethod);
// Add events to span
span.addEvent('order-validated', {
'validation.result': 'success',
'inventory.available': inventory.quantity
});
span.setStatus({ code: SpanStatusCode.OK });
return { success: true, orderId };
} catch (error) {
// Record exceptions
span.recordException(error);
span.setStatus({
code: SpanStatusCode.ERROR,
message: error.message
});
throw error;
} finally {
span.end();
}
}
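One caveat about the example above: startSpan() creates a span but does not make it the active span in the current context, so spans created by instrumented libraries inside processOrder will not automatically become its children. A minimal sketch using startActiveSpan(), which does activate the span for everything awaited inside the callback (checkInventory stands in for any instrumented call):

const { trace, SpanStatusCode } = require('@opentelemetry/api');

const tracer = trace.getTracer('order-service');

async function processOrderActive(orderId) {
  return tracer.startActiveSpan('process-order', async (span) => {
    try {
      span.setAttribute('order.id', orderId);
      const inventory = await checkInventory(orderId); // recorded as a child span
      return { success: true, orderId, available: inventory.quantity > 0 };
    } catch (error) {
      span.recordException(error);
      span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
      throw error;
    } finally {
      span.end(); // startActiveSpan does not end the span for you
    }
  });
}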
2. Metrics
Metrics are numeric measurements, such as counters, gauges, and histograms.
const { metrics } = require('@opentelemetry/api');
class OrderMetrics {
constructor() {
const meter = metrics.getMeter('order-service', '1.0.0');
// Counter - a value that only ever increases
this.ordersTotal = meter.createCounter('orders_total', {
description: 'Total number of orders processed'
});
// UpDownCounter (gauge-like) - a value that can go up and down
this.activeConnections = meter.createUpDownCounter('active_connections', {
description: 'Number of active database connections'
});
// Histogram - a distribution of values
this.requestDuration = meter.createHistogram('request_duration_ms', {
description: 'Request duration in milliseconds',
unit: 'ms'
});
}
recordOrderCreated(customerId, region, orderType) {
this.ordersTotal.add(1, {
customer_id: customerId,
region: region,
order_type: orderType,
status: 'created'
});
}
recordRequestDuration(duration, endpoint, method, statusCode) {
this.requestDuration.record(duration, {
endpoint: endpoint,
method: method,
status_code: statusCode.toString()
});
}
}
// Usage
const orderMetrics = new OrderMetrics();
app.use((req, res, next) => {
const startTime = Date.now();
res.on('finish', () => {
const duration = Date.now() - startTime;
orderMetrics.recordRequestDuration(
duration,
req.route?.path || req.path,
req.method,
res.statusCode
);
});
next();
});
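Note that createUpDownCounter suits values you adjust yourself (e.g. incrementing and decrementing a connection count). For values you would rather sample at collection time, an asynchronous ObservableGauge is often a better fit; a minimal sketch (the metric name is illustrative):

const { metrics } = require('@opentelemetry/api');

const meter = metrics.getMeter('order-service', '1.0.0');

// ObservableGauge - the SDK invokes the callback each time metrics are collected
const rssGauge = meter.createObservableGauge('process_memory_rss_bytes', {
  description: 'Resident set size of the Node.js process',
  unit: 'By'
});

rssGauge.addCallback((observableResult) => {
  observableResult.observe(process.memoryUsage().rss, { service: 'order-service' });
});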
3. Logs (Structured Logging)
const { trace } = require('@opentelemetry/api');
const winston = require('winston');
class StructuredLogger {
constructor() {
this.logger = winston.createLogger({
level: 'info',
format: winston.format.combine(
winston.format.timestamp(),
winston.format.errors({ stack: true }),
winston.format.json()
),
transports: [
new winston.transports.Console(),
new winston.transports.File({ filename: 'app.log' })
]
});
}
logWithTrace(level, message, metadata = {}) {
const span = trace.getActiveSpan();
const spanContext = span?.spanContext();
this.logger.log(level, message, {
...metadata,
traceId: spanContext?.traceId,
spanId: spanContext?.spanId,
traceFlags: spanContext?.traceFlags,
timestamp: new Date().toISOString(),
service: 'order-service',
version: process.env.SERVICE_VERSION
});
}
info(message, metadata) {
this.logWithTrace('info', message, metadata);
}
error(message, error, metadata = {}) {
this.logWithTrace('error', message, {
...metadata,
error: {
name: error.name,
message: error.message,
stack: error.stack
}
});
}
}
// Usage
const logger = new StructuredLogger();
async function processPayment(orderId, paymentMethod) {
const span = trace.getTracer('order-service').startSpan('process-payment');
try {
logger.info('Processing payment', {
orderId,
paymentMethod: paymentMethod.type,
amount: paymentMethod.amount
});
const result = await paymentGateway.charge(paymentMethod);
logger.info('Payment processed successfully', {
orderId,
transactionId: result.transactionId,
amount: result.amount
});
return result;
} catch (error) {
logger.error('Payment processing failed', error, {
orderId,
paymentMethod: paymentMethod.type
});
throw error;
} finally {
span.end();
}
}
OpenTelemetry Setup
Node.js Complete Setup
// tracing.js
const { NodeSDK } = require('@opentelemetry/sdk-node');
const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node');
const { Resource } = require('@opentelemetry/resources');
const { SemanticResourceAttributes } = require('@opentelemetry/semantic-conventions');
// Exporters
const { JaegerExporter } = require('@opentelemetry/exporter-jaeger');
const { PrometheusExporter } = require('@opentelemetry/exporter-prometheus');
const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-http');
// Create resource with service information
const resource = new Resource({
[SemanticResourceAttributes.SERVICE_NAME]: 'order-service',
[SemanticResourceAttributes.SERVICE_VERSION]: '1.0.0',
[SemanticResourceAttributes.SERVICE_INSTANCE_ID]: process.env.HOSTNAME || 'localhost',
[SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: process.env.NODE_ENV || 'development'
});
// Configure trace exporter
const traceExporter = process.env.NODE_ENV === 'production'
? new OTLPTraceExporter({
url: 'https://api.honeycomb.io/v1/traces',
headers: {
'x-honeycomb-team': process.env.HONEYCOMB_API_KEY
}
})
: new JaegerExporter({
endpoint: 'http://localhost:14268/api/traces'
});
// Configure metrics exporter
const metricsExporter = new PrometheusExporter({
port: 9464
});
// Initialize SDK
const sdk = new NodeSDK({
resource,
traceExporter,
metricReader: metricsExporter, // PrometheusExporter acts as a MetricReader in recent SDK versions (older SDKs used metricExporter)
instrumentations: [
getNodeAutoInstrumentations({
'@opentelemetry/instrumentation-fs': {
enabled: false // disable file-system instrumentation to reduce noise
},
'@opentelemetry/instrumentation-http': {
enabled: true,
requestHook: (span, request) => {
span.setAttributes({
'http.request_id': request.headers['x-request-id'],
'user.id': request.headers['x-user-id']
});
},
responseHook: (span, response) => {
span.setAttributes({
'http.response.size': response.headers['content-length']
});
}
},
'@opentelemetry/instrumentation-express': {
enabled: true
},
'@opentelemetry/instrumentation-mongodb': {
enabled: true,
enhancedDatabaseReporting: true
},
'@opentelemetry/instrumentation-redis': {
enabled: true,
dbStatementSerializer: (cmdName, cmdArgs) => {
return `${cmdName} ${cmdArgs[0]}`; // record only the command and the first key
}
}
})
]
});
// Start tracing before importing application
sdk.start();
// Graceful shutdown
process.on('SIGTERM', () => {
sdk.shutdown()
.then(() => console.log('Telemetry terminated'))
.catch((error) => console.log('Error terminating telemetry', error))
.finally(() => process.exit(0));
});
module.exports = sdk;
// app.js
// Import tracing FIRST, before any other modules
require('./tracing');
const express = require('express');
const { trace, context, propagation, SpanStatusCode } = require('@opentelemetry/api');
const app = express();
// Custom instrumentation middleware
app.use((req, res, next) => {
const span = trace.getActiveSpan();
// Incoming trace context has already been extracted by the HTTP instrumentation,
// so the active span here is already part of the caller's trace
// Add custom attributes
if (span) {
span.setAttributes({
'http.request.id': req.headers['x-request-id'] || generateRequestId(),
'user.id': req.headers['x-user-id'],
'http.user_agent': req.headers['user-agent'],
'http.real_ip': req.headers['x-real-ip'] || req.ip
});
}
next();
});
// Health check endpoint
app.get('/health', (req, res) => {
const span = trace.getActiveSpan();
span?.addEvent('health-check-requested');
res.json({
status: 'healthy',
timestamp: new Date().toISOString(),
service: 'order-service',
version: process.env.SERVICE_VERSION
});
});
// Business logic endpoints
app.post('/orders', async (req, res) => {
const tracer = trace.getTracer('order-service');
const span = tracer.startSpan('create-order-endpoint');
try {
const orderData = req.body;
// Validate input
if (!orderData.customerId || !orderData.items) {
span.setStatus({ code: SpanStatusCode.ERROR, message: 'Invalid input' });
return res.status(400).json({ error: 'Invalid order data' });
}
// Process order (this will create child spans automatically)
const result = await processOrder(orderData);
span.setAttributes({
'order.id': result.orderId,
'order.items.count': orderData.items.length,
'order.total.amount': result.totalAmount
});
res.status(201).json(result);
} catch (error) {
span.recordException(error);
span.setStatus({ code: SpanStatusCode.ERROR, message: error.message });
res.status(500).json({ error: 'Internal server error' });
} finally {
span.end();
}
});
function generateRequestId() {
return Math.random().toString(36).substring(2) + Date.now().toString(36);
}
const PORT = process.env.PORT || 3000;
app.listen(PORT, () => {
console.log(`Server running on port ${PORT}`);
});
Python (FastAPI) Setup
# tracing.py
from opentelemetry import trace, metrics
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
from opentelemetry.exporter.prometheus import PrometheusMetricReader
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.requests import RequestsInstrumentor
# Configure resource
resource = Resource.create({
SERVICE_NAME: "order-service-python",
SERVICE_VERSION: "1.0.0"
})
# Setup tracing
trace.set_tracer_provider(TracerProvider(resource=resource))
tracer = trace.get_tracer(__name__)
# Setup Jaeger exporter
jaeger_exporter = JaegerExporter(
agent_host_name="localhost",
agent_port=6831,
)
# Add span processor
span_processor = BatchSpanProcessor(jaeger_exporter)
trace.get_tracer_provider().add_span_processor(span_processor)
# Setup metrics
# PrometheusMetricReader has no port argument; expose the metrics endpoint
# with prometheus_client's built-in HTTP server instead
from prometheus_client import start_http_server
start_http_server(port=8000)
prometheus_reader = PrometheusMetricReader()
metrics.set_meter_provider(MeterProvider(
resource=resource,
metric_readers=[prometheus_reader]
))
meter = metrics.get_meter(__name__)
# Auto-instrumentation
RequestsInstrumentor().instrument()
SQLAlchemyInstrumentor().instrument()
RedisInstrumentor().instrument()
# Custom metrics
order_counter = meter.create_counter(
name="orders_created_total",
description="Total number of orders created"
)
request_duration = meter.create_histogram(
name="http_request_duration_seconds",
description="HTTP request duration in seconds"
)
# main.py
from fastapi import FastAPI, HTTPException, Depends
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry import trace
import logging
import time
from typing import Dict, Any
import asyncio
# Import tracing setup
import tracing
app = FastAPI(title="Order Service", version="1.0.0")
# Auto-instrument FastAPI
FastAPIInstrumentor.instrument_app(app)
# Get tracer
tracer = trace.get_tracer(__name__)
# Structured logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class OrderService:
def __init__(self):
self.orders_db = {} # Mock database
async def create_order(self, order_data: Dict[str, Any]) -> Dict[str, Any]:
# Create a span for this operation
with tracer.start_as_current_span("create_order") as span:
# Add span attributes
span.set_attributes({
"order.customer_id": order_data.get("customer_id"),
"order.items_count": len(order_data.get("items", [])),
"operation": "create_order"
})
# Simulate processing time
await asyncio.sleep(0.1)
# Validate order
if not order_data.get("customer_id"):
span.record_exception(ValueError("Customer ID is required"))
raise HTTPException(status_code=400, detail="Customer ID is required")
# Create order
order_id = f"order_{int(time.time())}"
order = {
"id": order_id,
"customer_id": order_data["customer_id"],
"items": order_data.get("items", []),
"status": "created",
"created_at": time.time()
}
# Save to "database"
self.orders_db[order_id] = order
# Add event to span
span.add_event("order_created", {
"order.id": order_id,
"order.status": "created"
})
# Record metrics
tracing.order_counter.add(1, {
"customer_id": order_data["customer_id"],
"status": "created"
})
logger.info("Order created successfully", extra={
"order_id": order_id,
"customer_id": order_data["customer_id"],
"trace_id": format(span.get_span_context().trace_id, "032x"),
"span_id": format(span.get_span_context().span_id, "016x")
})
return order
order_service = OrderService()
@app.middleware("http")
async def add_process_time_header(request, call_next):
start_time = time.time()
response = await call_next(request)
process_time = time.time() - start_time
# Record request duration metric
tracing.request_duration.record(process_time, {
"method": request.method,
"endpoint": str(request.url.path),
"status_code": str(response.status_code)
})
response.headers["X-Process-Time"] = str(process_time)
return response
@app.get("/health")
async def health_check():
with tracer.start_as_current_span("health_check") as span:
span.add_event("health_check_requested")
return {
"status": "healthy",
"service": "order-service-python",
"version": "1.0.0"
}
@app.post("/orders")
async def create_order(order_data: Dict[str, Any]):
return await order_service.create_order(order_data)
@app.get("/orders/{order_id}")
async def get_order(order_id: str):
with tracer.start_as_current_span("get_order") as span:
span.set_attributes({
"order.id": order_id,
"operation": "get_order"
})
order = order_service.orders_db.get(order_id)
if not order:
span.record_exception(HTTPException(status_code=404, detail="Order not found"))
raise HTTPException(status_code=404, detail="Order not found")
return order
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
Distributed Tracing
Cross-Service Communication
// Service A (Order Service)
const axios = require('axios');
const { trace, context, propagation } = require('@opentelemetry/api');
async function callInventoryService(orderId, items) {
const span = trace.getTracer('order-service').startSpan('call-inventory-service');
try {
// Inject trace context into HTTP headers
const headers = {};
propagation.inject(context.active(), headers);
const response = await axios.post('http://inventory-service:3001/check', {
orderId,
items
}, { headers });
span.setAttributes({
'http.method': 'POST',
'http.url': 'http://inventory-service:3001/check',
'http.status_code': response.status,
'inventory.items_count': items.length
});
return response.data;
} catch (error) {
span.recordException(error);
throw error;
} finally {
span.end();
}
}
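With the default W3C Trace Context propagator, propagation.inject() writes a traceparent header onto the outgoing request, which is how the inventory service can join the same trace. The values below are illustrative:

// headers after propagation.inject() (illustrative values)
// traceparent: 00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01
//              |  |                                |                |
//              |  trace id (32 hex)                parent span id   trace flags (01 = sampled)
//              version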
// Service B (Inventory Service)
const express = require('express');
const { trace, context, propagation } = require('@opentelemetry/api');
app.post('/check', (req, res) => {
// Extract trace context from incoming headers
const parentContext = propagation.extract(context.active(), req.headers);
// Start a new span with the extracted context as parent
const span = trace.getTracer('inventory-service').startSpan('check-inventory', {}, parentContext);
context.with(trace.setSpan(parentContext, span), async () => {
try {
const { orderId, items } = req.body;
span.setAttributes({
'inventory.order_id': orderId,
'inventory.items_count': items.length
});
// Simulate inventory check
const availability = await checkItemsAvailability(items);
span.addEvent('inventory-checked', {
'result': availability.allAvailable ? 'available' : 'unavailable',
'available_items': availability.availableCount
});
res.json(availability);
} catch (error) {
span.recordException(error);
res.status(500).json({ error: error.message });
} finally {
span.end();
}
});
});
Database Instrumentation
const { MongoClient, ObjectId } = require('mongodb');
const { trace } = require('@opentelemetry/api');
class OrderRepository {
constructor(mongoUrl) {
this.client = new MongoClient(mongoUrl);
this.db = null;
}
async connect() {
await this.client.connect();
this.db = this.client.db('orders');
}
async createOrder(orderData) {
const span = trace.getTracer('order-repository').startSpan('db-create-order');
try {
span.setAttributes({
'db.operation': 'create',
'db.collection': 'orders',
'order.customer_id': orderData.customerId
});
// The MongoDB instrumentation creates child spans automatically
const result = await this.db.collection('orders').insertOne({
...orderData,
createdAt: new Date(),
updatedAt: new Date()
});
span.setAttributes({
'db.inserted_id': result.insertedId.toString(),
'db.acknowledged': result.acknowledged
});
return result;
} catch (error) {
span.recordException(error);
throw error;
} finally {
span.end();
}
}
async findOrderById(orderId) {
const span = trace.getTracer('order-repository').startSpan('db-find-order');
try {
span.setAttributes({
'db.operation': 'findOne',
'db.collection': 'orders',
'order.id': orderId
});
const order = await this.db.collection('orders').findOne({
_id: new ObjectId(orderId)
});
span.setAttributes({
'db.found': order !== null
});
return order;
} catch (error) {
span.recordException(error);
throw error;
} finally {
span.end();
}
}
}
Custom Instrumentation
Business Logic Tracing
class PaymentProcessor {
constructor() {
this.tracer = trace.getTracer('payment-processor', '1.0.0');
}
async processPayment(orderId, paymentData) {
return this.tracer.startActiveSpan('process-payment', async (span) => {
try {
span.setAttributes({
'payment.order_id': orderId,
'payment.method': paymentData.method,
'payment.amount': paymentData.amount,
'payment.currency': paymentData.currency
});
// Step 1: Validate payment data
await this.validatePaymentData(paymentData);
// Step 2: Check fraud
const fraudCheck = await this.checkFraud(paymentData);
span.addEvent('fraud-check-completed', {
'fraud.score': fraudCheck.score,
'fraud.approved': fraudCheck.approved
});
if (!fraudCheck.approved) {
throw new Error('Payment rejected due to fraud detection');
}
// Step 3: Charge payment method
const chargeResult = await this.chargePaymentMethod(paymentData);
// Step 4: Record transaction
const transaction = await this.recordTransaction(orderId, chargeResult);
span.setAttributes({
'payment.transaction_id': transaction.id,
'payment.status': 'completed'
});
span.setStatus({ code: SpanStatusCode.OK });
return transaction;
} catch (error) {
span.recordException(error);
span.setStatus({
code: SpanStatusCode.ERROR,
message: error.message
});
throw error;
} finally {
  span.end();
}
});
}
async validatePaymentData(paymentData) {
  return this.tracer.startActiveSpan('validate-payment-data', async (span) => {
    try {
      // Validation logic with detailed tracing
      span.setAttributes({
        'validation.method': paymentData.method,
        'validation.has_card_number': !!paymentData.cardNumber,
        'validation.has_cvv': !!paymentData.cvv
      });
      if (!paymentData.cardNumber) {
        throw new Error('Card number is required');
      }
      // More validation...
      span.addEvent('validation-completed', { 'result': 'valid' });
    } finally {
      span.end();
    }
  });
}
async checkFraud(paymentData) {
return this.tracer.startActiveSpan('fraud-check', async (span) => {
span.setAttributes({
'fraud.check_type': 'ml_model',
'fraud.model_version': '2.1.0'
});
// Simulate ML model call
const score = Math.random();
const approved = score < 0.8;
span.setAttributes({
'fraud.score': score,
'fraud.threshold': 0.8,
'fraud.approved': approved
});
span.end();
return { score, approved };
});
}
}
Performance Monitoring
class PerformanceTracker {
constructor() {
this.meter = metrics.getMeter('performance-tracker', '1.0.0');
this.responseTimeHistogram = this.meter.createHistogram('http_request_duration_ms', {
description: 'HTTP request duration in milliseconds'
});
this.activeRequestsGauge = this.meter.createUpDownCounter('http_active_requests', {
description: 'Number of active HTTP requests'
});
this.errorCounter = this.meter.createCounter('http_errors_total', {
description: 'Total number of HTTP errors'
});
}
trackRequest(req, res, next) {
const startTime = Date.now();
// Increment active requests
this.activeRequestsGauge.add(1, {
method: req.method,
route: req.route?.path || req.path
});
// Track when request finishes
res.on('finish', () => {
const duration = Date.now() - startTime;
const labels = {
method: req.method,
route: req.route?.path || req.path,
status_code: res.statusCode.toString(),
status_class: `${Math.floor(res.statusCode / 100)}xx`
};
// Record response time
this.responseTimeHistogram.record(duration, labels);
// Decrement active requests
this.activeRequestsGauge.add(-1, {
method: req.method,
route: req.route?.path || req.path
});
// Count errors
if (res.statusCode >= 400) {
this.errorCounter.add(1, {
...labels,
error_type: res.statusCode >= 500 ? 'server_error' : 'client_error'
});
}
// Add trace attributes for performance
const span = trace.getActiveSpan();
if (span) {
span.setAttributes({
'http.response_time_ms': duration,
'performance.slow_request': duration > 1000
});
if (duration > 5000) {
span.addEvent('slow-request-detected', {
'threshold_ms': 5000,
'actual_ms': duration
});
}
}
});
next();
}
}
// Usage
const performanceTracker = new PerformanceTracker();
app.use(performanceTracker.trackRequest.bind(performanceTracker));
Monitoring Stack Setup
Docker Compose for Observability Stack
# docker-compose.observability.yml
version: '3.8'
services:
# Jaeger - Distributed Tracing
jaeger:
image: jaegertracing/all-in-one:1.50
ports:
- "16686:16686" # Jaeger UI
- "14268:14268" # Jaeger collector HTTP
- "6831:6831/udp" # Jaeger agent UDP
environment:
- COLLECTOR_OTLP_ENABLED=true
networks:
- observability
# Prometheus - Metrics Collection
prometheus:
image: prom/prometheus:v2.45.0
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/etc/prometheus/console_libraries'
- '--web.console.templates=/etc/prometheus/consoles'
- '--storage.tsdb.retention.time=200h'
- '--web.enable-lifecycle'
networks:
- observability
# Grafana - Visualization
grafana:
image: grafana/grafana:10.1.0
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin123
volumes:
- grafana_data:/var/lib/grafana
- ./grafana/provisioning:/etc/grafana/provisioning
networks:
- observability
# Elasticsearch - Log Storage
elasticsearch:
image: docker.elastic.co/elasticsearch/elasticsearch:8.9.0
ports:
- "9200:9200"
environment:
- discovery.type=single-node
- xpack.security.enabled=false
- "ES_JAVA_OPTS=-Xms512m -Xmx512m"
volumes:
- elasticsearch_data:/usr/share/elasticsearch/data
networks:
- observability
# Kibana - Log Visualization
kibana:
image: docker.elastic.co/kibana/kibana:8.9.0
ports:
- "5601:5601"
environment:
- ELASTICSEARCH_HOSTS=http://elasticsearch:9200
depends_on:
- elasticsearch
networks:
- observability
# Logstash - Log Processing
logstash:
image: docker.elastic.co/logstash/logstash:8.9.0
ports:
- "5044:5044"
- "12201:12201/udp"
volumes:
- ./logstash/pipeline:/usr/share/logstash/pipeline
- ./logstash/config:/usr/share/logstash/config
depends_on:
- elasticsearch
networks:
- observability
volumes:
prometheus_data:
grafana_data:
elasticsearch_data:
networks:
observability:
driver: bridge
Prometheus Configuration
# prometheus.yml
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
- "alert_rules.yml"
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'order-service'
static_configs:
- targets: ['order-service:9464']
metrics_path: /metrics
scrape_interval: 10s
- job_name: 'inventory-service'
static_configs:
- targets: ['inventory-service:9464']
- job_name: 'payment-service'
static_configs:
- targets: ['payment-service:9464']
- job_name: 'node-exporter'
static_configs:
- targets: ['node-exporter:9100']
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
Grafana Dashboard
{
"dashboard": {
"title": "Microservices Observability",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "{{service}} - {{method}} {{route}}"
}
]
},
{
"title": "Response Time (95th percentile)",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, rate(http_request_duration_ms_bucket[5m]))",
"legendFormat": "{{service}} - 95th percentile"
}
]
},
{
"title": "Error Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_errors_total[5m]) / rate(http_requests_total[5m])",
"legendFormat": "{{service}} - Error Rate"
}
]
},
{
"title": "Service Map",
"type": "nodeGraph",
"targets": [
{
"expr": "sum(rate(traces_spanmetrics_calls_total[5m])) by (service_name, span_name)"
}
]
}
]
}
}
Best Practices
1. Sampling Strategy
// Intelligent sampling
const { TraceIdRatioBasedSampler, ParentBasedSampler, AlwaysOnSampler } = require('@opentelemetry/sdk-trace-base');
class IntelligentSampler {
constructor() {
this.errorSampler = new AlwaysOnSampler(); // Always sample errors
this.slowRequestSampler = new AlwaysOnSampler(); // Always sample slow requests
this.normalSampler = new TraceIdRatioBasedSampler(0.1); // 10% sampling for normal requests
}
shouldSample(context, traceId, spanName, spanKind, attributes, links) {
// Always sample if there's an error
if (attributes['error'] || attributes['http.status_code'] >= 400) {
return this.errorSampler.shouldSample(context, traceId, spanName, spanKind, attributes, links);
}
// Always sample slow requests
if (attributes['http.response_time_ms'] > 1000) {
return this.slowRequestSampler.shouldSample(context, traceId, spanName, spanKind, attributes, links);
}
// For critical endpoints, use higher sampling rate
const criticalEndpoints = ['/orders', '/payments', '/checkout'];
if (criticalEndpoints.some(endpoint => spanName.includes(endpoint))) {
return new TraceIdRatioBasedSampler(0.5).shouldSample(context, traceId, spanName, spanKind, attributes, links);
}
// Default sampling
return this.normalSampler.shouldSample(context, traceId, spanName, spanKind, attributes, links);
}
}
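The sampler still has to be registered with the SDK; nothing above wires IntelligentSampler in. A minimal sketch, assuming the NodeSDK sampler option from the earlier tracing.js setup, and wrapping it in ParentBasedSampler so child spans follow their parent's sampling decision:

const { ParentBasedSampler } = require('@opentelemetry/sdk-trace-base');

const sdk = new NodeSDK({
  resource,
  traceExporter,
  // Apply IntelligentSampler only to root spans; children inherit the parent's decision
  sampler: new ParentBasedSampler({ root: new IntelligentSampler() }),
  instrumentations: [getNodeAutoInstrumentations()]
});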
2. Context Propagation
// Custom context propagation
class BusinessContextPropagator {
inject(context, carrier, setter) {
const span = trace.getSpan(context);
if (span) {
  // Propagate business context.
  // The API Span type has no attribute getters, so reading span.attributes
  // assumes the active span is an SDK span (ReadableSpan).
  const attrs = span.attributes || {};
  setter.set(carrier, 'x-user-id', attrs['user.id']);
  setter.set(carrier, 'x-tenant-id', attrs['tenant.id']);
  setter.set(carrier, 'x-correlation-id', attrs['correlation.id']);
}
}
extract(context, carrier, getter) {
const userId = getter.get(carrier, 'x-user-id');
const tenantId = getter.get(carrier, 'x-tenant-id');
const correlationId = getter.get(carrier, 'x-correlation-id');
// Add to current span
const span = trace.getActiveSpan();
if (span && userId) {
span.setAttributes({
'user.id': userId,
'tenant.id': tenantId,
'correlation.id': correlationId
});
}
return context;
}
fields() {
return ['x-user-id', 'x-tenant-id', 'x-correlation-id'];
}
}
// Register the propagator (CompositePropagator and W3CTraceContextPropagator come from @opentelemetry/core)
const { CompositePropagator, W3CTraceContextPropagator } = require('@opentelemetry/core');
propagation.setGlobalPropagator(
new CompositePropagator({
propagators: [
new W3CTraceContextPropagator(),
new BusinessContextPropagator()
]
})
);
3. Alert Rules
# alert_rules.yml
groups:
- name: microservices_alerts
rules:
- alert: HighErrorRate
expr: rate(http_errors_total[5m]) / rate(http_requests_total[5m]) > 0.1
for: 2m
labels:
severity: warning
annotations:
summary: "High error rate detected"
description: "Error rate is {{ $value | humanizePercentage }} for service {{ $labels.service }}"
- alert: HighResponseTime
expr: histogram_quantile(0.95, rate(http_request_duration_ms_bucket[5m])) > 2000
for: 5m
labels:
severity: warning
annotations:
summary: "High response time detected"
description: "95th percentile response time is {{ $value }}ms for service {{ $labels.service }}"
- alert: ServiceDown
expr: up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Service is down"
description: "Service {{ $labels.instance }} has been down for more than 1 minute"
Summary
OpenTelemetry is a powerful tool for building observability into microservices:
- Unified Observability: brings traces, metrics, and logs together in one place
- Vendor Agnostic: not tied to any particular monitoring tool
- Auto-Instrumentation: easy to set up, with useful data from day one
- Custom Instrumentation: can be tailored to your needs
- Distributed Tracing: follows requests across service boundaries
- Performance Monitoring: measures performance in real time
- Alerting: notifies you when something goes wrong
Using OpenTelemetry helps us understand our systems more deeply, fix problems faster, and optimize performance in a targeted way, making our microservices more stable and reliable.