You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ext_otel.py 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259
  1. import atexit
  2. import contextlib
  3. import logging
  4. import os
  5. import platform
  6. import socket
  7. import sys
  8. from typing import Union
  9. import flask
  10. from celery.signals import worker_init
  11. from flask_login import user_loaded_from_request, user_logged_in # type: ignore
  12. from configs import dify_config
  13. from dify_app import DifyApp
  14. from libs.helper import extract_tenant_id
  15. from models import Account, EndUser
  16. logger = logging.getLogger(__name__)
  17. @user_logged_in.connect
  18. @user_loaded_from_request.connect
  19. def on_user_loaded(_sender, user: Union["Account", "EndUser"]):
  20. if dify_config.ENABLE_OTEL:
  21. from opentelemetry.trace import get_current_span
  22. if user:
  23. try:
  24. current_span = get_current_span()
  25. tenant_id = extract_tenant_id(user)
  26. if not tenant_id:
  27. return
  28. if current_span:
  29. current_span.set_attribute("service.tenant.id", tenant_id)
  30. current_span.set_attribute("service.user.id", user.id)
  31. except Exception:
  32. logger.exception("Error setting tenant and user attributes")
  33. pass
  34. def init_app(app: DifyApp):
  35. from opentelemetry.semconv.trace import SpanAttributes
  36. def is_celery_worker():
  37. return "celery" in sys.argv[0].lower()
  38. def instrument_exception_logging():
  39. exception_handler = ExceptionLoggingHandler()
  40. logging.getLogger().addHandler(exception_handler)
  41. def init_flask_instrumentor(app: DifyApp):
  42. meter = get_meter("http_metrics", version=dify_config.project.version)
  43. _http_response_counter = meter.create_counter(
  44. "http.server.response.count",
  45. description="Total number of HTTP responses by status code, method and target",
  46. unit="{response}",
  47. )
  48. def response_hook(span: Span, status: str, response_headers: list):
  49. if span and span.is_recording():
  50. try:
  51. if status.startswith("2"):
  52. span.set_status(StatusCode.OK)
  53. else:
  54. span.set_status(StatusCode.ERROR, status)
  55. status = status.split(" ")[0]
  56. status_code = int(status)
  57. status_class = f"{status_code // 100}xx"
  58. attributes: dict[str, str | int] = {"status_code": status_code, "status_class": status_class}
  59. request = flask.request
  60. if request and request.url_rule:
  61. attributes[SpanAttributes.HTTP_TARGET] = str(request.url_rule.rule)
  62. if request and request.method:
  63. attributes[SpanAttributes.HTTP_METHOD] = str(request.method)
  64. _http_response_counter.add(1, attributes)
  65. except Exception:
  66. logger.exception("Error setting status and attributes")
  67. pass
  68. instrumentor = FlaskInstrumentor()
  69. if dify_config.DEBUG:
  70. logger.info("Initializing Flask instrumentor")
  71. instrumentor.instrument_app(app, response_hook=response_hook)
  72. def init_sqlalchemy_instrumentor(app: DifyApp):
  73. with app.app_context():
  74. engines = list(app.extensions["sqlalchemy"].engines.values())
  75. SQLAlchemyInstrumentor().instrument(enable_commenter=True, engines=engines)
  76. def setup_context_propagation():
  77. # Configure propagators
  78. set_global_textmap(
  79. CompositePropagator(
  80. [
  81. TraceContextTextMapPropagator(), # W3C trace context
  82. B3Format(), # B3 propagation (used by many systems)
  83. ]
  84. )
  85. )
  86. def shutdown_tracer():
  87. provider = trace.get_tracer_provider()
  88. if hasattr(provider, "force_flush"):
  89. provider.force_flush()
  90. class ExceptionLoggingHandler(logging.Handler):
  91. """Custom logging handler that creates spans for logging.exception() calls"""
  92. def emit(self, record: logging.LogRecord):
  93. with contextlib.suppress(Exception):
  94. if record.exc_info:
  95. tracer = get_tracer_provider().get_tracer("dify.exception.logging")
  96. with tracer.start_as_current_span(
  97. "log.exception",
  98. attributes={
  99. "log.level": record.levelname,
  100. "log.message": record.getMessage(),
  101. "log.logger": record.name,
  102. "log.file.path": record.pathname,
  103. "log.file.line": record.lineno,
  104. },
  105. ) as span:
  106. span.set_status(StatusCode.ERROR)
  107. if record.exc_info[1]:
  108. span.record_exception(record.exc_info[1])
  109. span.set_attribute("exception.message", str(record.exc_info[1]))
  110. if record.exc_info[0]:
  111. span.set_attribute("exception.type", record.exc_info[0].__name__)
  112. from opentelemetry import trace
  113. from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter as GRPCMetricExporter
  114. from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter as GRPCSpanExporter
  115. from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter as HTTPMetricExporter
  116. from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter as HTTPSpanExporter
  117. from opentelemetry.instrumentation.celery import CeleryInstrumentor
  118. from opentelemetry.instrumentation.flask import FlaskInstrumentor
  119. from opentelemetry.instrumentation.redis import RedisInstrumentor
  120. from opentelemetry.instrumentation.requests import RequestsInstrumentor
  121. from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor
  122. from opentelemetry.metrics import get_meter, get_meter_provider, set_meter_provider
  123. from opentelemetry.propagate import set_global_textmap
  124. from opentelemetry.propagators.b3 import B3Format
  125. from opentelemetry.propagators.composite import CompositePropagator
  126. from opentelemetry.sdk.metrics import MeterProvider
  127. from opentelemetry.sdk.metrics.export import ConsoleMetricExporter, PeriodicExportingMetricReader
  128. from opentelemetry.sdk.resources import Resource
  129. from opentelemetry.sdk.trace import TracerProvider
  130. from opentelemetry.sdk.trace.export import (
  131. BatchSpanProcessor,
  132. ConsoleSpanExporter,
  133. )
  134. from opentelemetry.sdk.trace.sampling import ParentBasedTraceIdRatio
  135. from opentelemetry.semconv.resource import ResourceAttributes
  136. from opentelemetry.trace import Span, get_tracer_provider, set_tracer_provider
  137. from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
  138. from opentelemetry.trace.status import StatusCode
  139. setup_context_propagation()
  140. # Initialize OpenTelemetry
  141. # Follow Semantic Convertions 1.32.0 to define resource attributes
  142. resource = Resource(
  143. attributes={
  144. ResourceAttributes.SERVICE_NAME: dify_config.APPLICATION_NAME,
  145. ResourceAttributes.SERVICE_VERSION: f"dify-{dify_config.project.version}-{dify_config.COMMIT_SHA}",
  146. ResourceAttributes.PROCESS_PID: os.getpid(),
  147. ResourceAttributes.DEPLOYMENT_ENVIRONMENT: f"{dify_config.DEPLOY_ENV}-{dify_config.EDITION}",
  148. ResourceAttributes.HOST_NAME: socket.gethostname(),
  149. ResourceAttributes.HOST_ARCH: platform.machine(),
  150. "custom.deployment.git_commit": dify_config.COMMIT_SHA,
  151. ResourceAttributes.HOST_ID: platform.node(),
  152. ResourceAttributes.OS_TYPE: platform.system().lower(),
  153. ResourceAttributes.OS_DESCRIPTION: platform.platform(),
  154. ResourceAttributes.OS_VERSION: platform.version(),
  155. }
  156. )
  157. sampler = ParentBasedTraceIdRatio(dify_config.OTEL_SAMPLING_RATE)
  158. provider = TracerProvider(resource=resource, sampler=sampler)
  159. set_tracer_provider(provider)
  160. exporter: Union[GRPCSpanExporter, HTTPSpanExporter, ConsoleSpanExporter]
  161. metric_exporter: Union[GRPCMetricExporter, HTTPMetricExporter, ConsoleMetricExporter]
  162. protocol = (dify_config.OTEL_EXPORTER_OTLP_PROTOCOL or "").lower()
  163. if dify_config.OTEL_EXPORTER_TYPE == "otlp":
  164. if protocol == "grpc":
  165. exporter = GRPCSpanExporter(
  166. endpoint=dify_config.OTLP_BASE_ENDPOINT,
  167. # Header field names must consist of lowercase letters, check RFC7540
  168. headers=(("authorization", f"Bearer {dify_config.OTLP_API_KEY}"),),
  169. insecure=True,
  170. )
  171. metric_exporter = GRPCMetricExporter(
  172. endpoint=dify_config.OTLP_BASE_ENDPOINT,
  173. headers=(("authorization", f"Bearer {dify_config.OTLP_API_KEY}"),),
  174. insecure=True,
  175. )
  176. else:
  177. headers = {"Authorization": f"Bearer {dify_config.OTLP_API_KEY}"} if dify_config.OTLP_API_KEY else None
  178. trace_endpoint = dify_config.OTLP_TRACE_ENDPOINT
  179. if not trace_endpoint:
  180. trace_endpoint = dify_config.OTLP_BASE_ENDPOINT + "/v1/traces"
  181. exporter = HTTPSpanExporter(
  182. endpoint=trace_endpoint,
  183. headers=headers,
  184. )
  185. metric_endpoint = dify_config.OTLP_METRIC_ENDPOINT
  186. if not metric_endpoint:
  187. metric_endpoint = dify_config.OTLP_BASE_ENDPOINT + "/v1/metrics"
  188. metric_exporter = HTTPMetricExporter(
  189. endpoint=metric_endpoint,
  190. headers=headers,
  191. )
  192. else:
  193. exporter = ConsoleSpanExporter()
  194. metric_exporter = ConsoleMetricExporter()
  195. provider.add_span_processor(
  196. BatchSpanProcessor(
  197. exporter,
  198. max_queue_size=dify_config.OTEL_MAX_QUEUE_SIZE,
  199. schedule_delay_millis=dify_config.OTEL_BATCH_EXPORT_SCHEDULE_DELAY,
  200. max_export_batch_size=dify_config.OTEL_MAX_EXPORT_BATCH_SIZE,
  201. export_timeout_millis=dify_config.OTEL_BATCH_EXPORT_TIMEOUT,
  202. )
  203. )
  204. reader = PeriodicExportingMetricReader(
  205. metric_exporter,
  206. export_interval_millis=dify_config.OTEL_METRIC_EXPORT_INTERVAL,
  207. export_timeout_millis=dify_config.OTEL_METRIC_EXPORT_TIMEOUT,
  208. )
  209. set_meter_provider(MeterProvider(resource=resource, metric_readers=[reader]))
  210. if not is_celery_worker():
  211. init_flask_instrumentor(app)
  212. CeleryInstrumentor(tracer_provider=get_tracer_provider(), meter_provider=get_meter_provider()).instrument()
  213. instrument_exception_logging()
  214. init_sqlalchemy_instrumentor(app)
  215. RedisInstrumentor().instrument()
  216. RequestsInstrumentor().instrument()
  217. atexit.register(shutdown_tracer)
  218. def is_enabled():
  219. return dify_config.ENABLE_OTEL
  220. @worker_init.connect(weak=False)
  221. def init_celery_worker(*args, **kwargs):
  222. if dify_config.ENABLE_OTEL:
  223. from opentelemetry.instrumentation.celery import CeleryInstrumentor
  224. from opentelemetry.metrics import get_meter_provider
  225. from opentelemetry.trace import get_tracer_provider
  226. tracer_provider = get_tracer_provider()
  227. metric_provider = get_meter_provider()
  228. if dify_config.DEBUG:
  229. logger.info("Initializing OpenTelemetry for Celery worker")
  230. CeleryInstrumentor(tracer_provider=tracer_provider, meter_provider=metric_provider).instrument()