{"host": "clouddumps1002.wikimedia.org", "state": "core_diff", "description": "Differences to core resources", "diff": {"full": {"total": 3735, "only_in_self": [], "only_in_other": [], "resource_diffs": [{"resource": "Nginx::Site[xmldumps]"}, {"resource": "File[/etc/nginx/sites-available/xmldumps]", "content": "--- /etc/nginx/sites-available/xmldumps.orig\n+++ /etc/nginx/sites-available/xmldumps\n@@ -4,6 +4,74 @@\n ##########################\n \n limit_conn_zone $remote_addr$http_user_agent zone=addr:10m;\n+\n+# The ECS spec just uses the HTTP version numbers, so we strip the HTTP/ prefix here.\n+map $server_protocol $ecs_http_version {\n+    \"HTTP/1.0\"  \"1.0\";\n+    \"HTTP/1.1\"  \"1.1\";\n+    \"HTTP/2.0\"  \"2.0\";\n+    default     $server_protocol;\n+}\n+\n+# Here we use the HTTP status code to map to event.outcome\n+map $status $ecs_event_outcome {\n+    ~^2  \"success\";\n+    ~^3  \"success\";\n+    ~^4  \"failure\";\n+    ~^5  \"failure\";\n+    default \"unknown\";\n+}\n+\n+# nginx has no way to emit a literal '$', which the Event Platform '$schema' field\n+# name requires, so we expose one via a map and reference it as '${dollar}schema'.\n+map $host $dollar {\n+    default \"$\";\n+}\n+\n+# This directive configures the ECS compatible JSON log format.\n+# The '$schema', 'meta' and 'dt' fields are required by the Event Platform so that\n+# these events can be ingested into the Data Lake (see T291645). 'dt' is the Event\n+# Platform time field; '@timestamp' is retained for ECS compatibility.\n+log_format ecs_json escape=json\n+    '{'\n+        '\"${dollar}schema\":\"/development/elastic/ecs/1.0.0\",'\n+        '\"meta\":{'\n+            '\"stream\":\"webrequest.dumps.dev0\",'\n+            '\"id\":\"$request_id\",'\n+            '\"dt\":\"$time_iso8601\",'\n+            '\"domain\":\"$host\"'\n+        '},'\n+        '\"dt\":\"$time_iso8601\",'\n+        '\"@timestamp\":\"$time_iso8601\",'\n+        '\"ecs\":{\"version\":\"1.11.0\"},'\n+        '\"event\":{'\n+            '\"dataset\":\"dumps.access\",'\n+            '\"kind\":\"event\",'\n+            '\"category\":\"web\",'\n+            '\"type\":\"access\",'\n+            '\"outcome\":\"$ecs_event_outcome\"'\n+        '},'\n+        '\"http\":{'\n+            '\"request\":{'\n+                '\"method\":\"$request_method\",'\n+                '\"referrer\":\"$http_referer\",'\n+                '\"bytes\":$request_length'\n+            '},'\n+            '\"response\":{'\n+                '\"status_code\":$status,'\n+                '\"body\":{\"bytes\":$body_bytes_sent}'\n+            '},'\n+            '\"version\":\"$ecs_http_version\"'\n+        '},'\n+        '\"url\":{'\n+            '\"original\":\"$request_uri\",'\n+            '\"path\":\"$uri\",'\n+            '\"query\":\"$query_string\"'\n+        '},'\n+        '\"source\":{\"ip\":\"$remote_addr\"},'\n+        '\"user_agent\":{\"original\":\"$http_user_agent\"},'\n+        '\"destination\":{\"address\":\"$host\"}'\n+    '}';\n \n server {\n    listen [::]:443 default ssl ipv6only=off;\n@@ -22,6 +90,12 @@\n    add_header Strict-Transport-Security \"max-age=106384710; includeSubDomains; preload\" always;\n \n    root /srv/dumps/xmldatadumps/public;\n+\n+   # We want to send access logs to syslog in addition to the standard log files.\n+   # The 'dumps-http' tag wil match with an entry in /etc/rsyslog.lookup.d/lookup_table_output.json\n+   # so these logs will be sent to the kafka-logging cluster.\n+   access_log /var/log/nginx/access.log; # duplicate of the standard configuration in the http block.\n+   access_log syslog:server=unix:/dev/log,facility=local7,tag=dumps-http,severity=info ecs_json;\n \n    add_header Server 'clouddumps1002.wikimedia.org' always;\n "}, {"resource": "File[/etc/rsyslog.d/30-output-kafka.conf]", "content": "--- /etc/rsyslog.d/30-output-kafka.conf.orig\n+++ /etc/rsyslog.d/30-output-kafka.conf\n@@ -13,6 +13,19 @@\n \n # define a template to be used by omkafka dynatopic\n template(name=\"kafka_topic\" type=\"string\" string=\"rsyslog-%syslogseverity-text%\")\n+\n+# Event Platform support (T291645): messages that carry a 'meta.stream' field are\n+# Event Platform events (e.g. ECS formatted logs). Produce them to the\n+# '<datacenter>.<meta.stream>' topic so they can be ingested into the Data Lake.\n+# The datacenter prefix is supplied by puppet; the stream suffix is read from the\n+# parsed json message via the '%!meta!stream%' property.\n+template(name=\"event_platform_topic\" type=\"string\" string=\"eqiad.%!meta!stream%\")\n+\n+# Emit the parsed json message verbatim (with $schema, meta and dt intact) so the\n+# event remains valid for Event Platform / Data Lake ingestion.\n+template(name=\"event_platform_json\" type=\"list\") {\n+  property(name=\"$!all-json\")\n+}\n \n # send to kafka if lookup table contains \"kafka\" for relevant programname\n # $.log_outputs defined by lookup table in lookup_output.conf\n@@ -42,6 +55,31 @@\n     # unfortunately rsyslog doesn't allow variables to be used as template\n     # names, so the kafka action is duplicated here.\n     if $parsesuccess == \"OK\" then {\n+        # Event Platform events carry a 'meta.stream' field. Produce these only to\n+        # the '<datacenter>.<meta.stream>' topic (T291645). Logstash consumes this\n+        # topic via an explicit kafka input rather than the 'rsyslog-*' pattern.\n+        if ($!meta!stream != \"\") then {\n+            action(type=\"omkafka\"\n+                   name=\"omkafka_event_platform\"\n+                   broker=[\"kafka-logging1001.eqiad.wmnet:9093\",\"kafka-logging1002.eqiad.wmnet:9093\",\"kafka-logging1003.eqiad.wmnet:9093\",\"kafka-logging1004.eqiad.wmnet:9093\",\"kafka-logging1005.eqiad.wmnet:9093\"]\n+                   topic=\"event_platform_topic\"\n+                   dynatopic=\"on\"\n+                   dynatopic.cachesize=\"1000\"\n+                   partitions.auto=\"on\"\n+                   template=\"event_platform_json\"\n+                   queue.type=\"LinkedList\" queue.size=\"10000\" queue.filename=\"output_kafka_event_platform\"\n+                   queue.highWatermark=\"7000\" queue.lowWatermark=\"6000\"\n+                   queue.checkpointInterval=\"5\"\n+                   queue.maxDiskSpace=\"40960000\"\n+                   confParam=[ \"security.protocol=ssl\",\n+                               \"ssl.ca.location=/etc/ssl/certs/wmf-ca-certificates.crt\",\n+                               \"compression.codec=snappy\",\n+                               \"socket.timeout.ms=10000\",\n+                               \"socket.keepalive.enable=true\",\n+                               \"queue.buffering.max.ms=50\",\n+                               \"batch.num.messages=1000\" ]\n+            )\n+        } else {\n         action(type=\"omkafka\"\n                name=\"omkafka_syslog_cee\"\n                broker=[\"kafka-logging1001.eqiad.wmnet:9093\",\"kafka-logging1002.eqiad.wmnet:9093\",\"kafka-logging1003.eqiad.wmnet:9093\",\"kafka-logging1004.eqiad.wmnet:9093\",\"kafka-logging1005.eqiad.wmnet:9093\"]\n@@ -62,6 +100,7 @@\n                            \"queue.buffering.max.ms=50\",\n                            \"batch.num.messages=1000\" ]\n         )\n+        }\n     } else {\n         # if ecs_170 in log_outputs, use that template to format\n         # non-json-formatted syslog events into an ecs-compatible form"}, {"resource": "Rsyslog::Conf[output_kafka]"}], "perc_changed": "0.11%"}, "core": {"total": 3735, "only_in_self": [], "only_in_other": [], "resource_diffs": [{"resource": "File[/etc/nginx/sites-available/xmldumps]", "content": "--- /etc/nginx/sites-available/xmldumps.orig\n+++ /etc/nginx/sites-available/xmldumps\n@@ -4,6 +4,74 @@\n ##########################\n \n limit_conn_zone $remote_addr$http_user_agent zone=addr:10m;\n+\n+# The ECS spec just uses the HTTP version numbers, so we strip the HTTP/ prefix here.\n+map $server_protocol $ecs_http_version {\n+    \"HTTP/1.0\"  \"1.0\";\n+    \"HTTP/1.1\"  \"1.1\";\n+    \"HTTP/2.0\"  \"2.0\";\n+    default     $server_protocol;\n+}\n+\n+# Here we use the HTTP status code to map to event.outcome\n+map $status $ecs_event_outcome {\n+    ~^2  \"success\";\n+    ~^3  \"success\";\n+    ~^4  \"failure\";\n+    ~^5  \"failure\";\n+    default \"unknown\";\n+}\n+\n+# nginx has no way to emit a literal '$', which the Event Platform '$schema' field\n+# name requires, so we expose one via a map and reference it as '${dollar}schema'.\n+map $host $dollar {\n+    default \"$\";\n+}\n+\n+# This directive configures the ECS compatible JSON log format.\n+# The '$schema', 'meta' and 'dt' fields are required by the Event Platform so that\n+# these events can be ingested into the Data Lake (see T291645). 'dt' is the Event\n+# Platform time field; '@timestamp' is retained for ECS compatibility.\n+log_format ecs_json escape=json\n+    '{'\n+        '\"${dollar}schema\":\"/development/elastic/ecs/1.0.0\",'\n+        '\"meta\":{'\n+            '\"stream\":\"webrequest.dumps.dev0\",'\n+            '\"id\":\"$request_id\",'\n+            '\"dt\":\"$time_iso8601\",'\n+            '\"domain\":\"$host\"'\n+        '},'\n+        '\"dt\":\"$time_iso8601\",'\n+        '\"@timestamp\":\"$time_iso8601\",'\n+        '\"ecs\":{\"version\":\"1.11.0\"},'\n+        '\"event\":{'\n+            '\"dataset\":\"dumps.access\",'\n+            '\"kind\":\"event\",'\n+            '\"category\":\"web\",'\n+            '\"type\":\"access\",'\n+            '\"outcome\":\"$ecs_event_outcome\"'\n+        '},'\n+        '\"http\":{'\n+            '\"request\":{'\n+                '\"method\":\"$request_method\",'\n+                '\"referrer\":\"$http_referer\",'\n+                '\"bytes\":$request_length'\n+            '},'\n+            '\"response\":{'\n+                '\"status_code\":$status,'\n+                '\"body\":{\"bytes\":$body_bytes_sent}'\n+            '},'\n+            '\"version\":\"$ecs_http_version\"'\n+        '},'\n+        '\"url\":{'\n+            '\"original\":\"$request_uri\",'\n+            '\"path\":\"$uri\",'\n+            '\"query\":\"$query_string\"'\n+        '},'\n+        '\"source\":{\"ip\":\"$remote_addr\"},'\n+        '\"user_agent\":{\"original\":\"$http_user_agent\"},'\n+        '\"destination\":{\"address\":\"$host\"}'\n+    '}';\n \n server {\n    listen [::]:443 default ssl ipv6only=off;\n@@ -22,6 +90,12 @@\n    add_header Strict-Transport-Security \"max-age=106384710; includeSubDomains; preload\" always;\n \n    root /srv/dumps/xmldatadumps/public;\n+\n+   # We want to send access logs to syslog in addition to the standard log files.\n+   # The 'dumps-http' tag wil match with an entry in /etc/rsyslog.lookup.d/lookup_table_output.json\n+   # so these logs will be sent to the kafka-logging cluster.\n+   access_log /var/log/nginx/access.log; # duplicate of the standard configuration in the http block.\n+   access_log syslog:server=unix:/dev/log,facility=local7,tag=dumps-http,severity=info ecs_json;\n \n    add_header Server 'clouddumps1002.wikimedia.org' always;\n "}, {"resource": "File[/etc/rsyslog.d/30-output-kafka.conf]", "content": "--- /etc/rsyslog.d/30-output-kafka.conf.orig\n+++ /etc/rsyslog.d/30-output-kafka.conf\n@@ -13,6 +13,19 @@\n \n # define a template to be used by omkafka dynatopic\n template(name=\"kafka_topic\" type=\"string\" string=\"rsyslog-%syslogseverity-text%\")\n+\n+# Event Platform support (T291645): messages that carry a 'meta.stream' field are\n+# Event Platform events (e.g. ECS formatted logs). Produce them to the\n+# '<datacenter>.<meta.stream>' topic so they can be ingested into the Data Lake.\n+# The datacenter prefix is supplied by puppet; the stream suffix is read from the\n+# parsed json message via the '%!meta!stream%' property.\n+template(name=\"event_platform_topic\" type=\"string\" string=\"eqiad.%!meta!stream%\")\n+\n+# Emit the parsed json message verbatim (with $schema, meta and dt intact) so the\n+# event remains valid for Event Platform / Data Lake ingestion.\n+template(name=\"event_platform_json\" type=\"list\") {\n+  property(name=\"$!all-json\")\n+}\n \n # send to kafka if lookup table contains \"kafka\" for relevant programname\n # $.log_outputs defined by lookup table in lookup_output.conf\n@@ -42,6 +55,31 @@\n     # unfortunately rsyslog doesn't allow variables to be used as template\n     # names, so the kafka action is duplicated here.\n     if $parsesuccess == \"OK\" then {\n+        # Event Platform events carry a 'meta.stream' field. Produce these only to\n+        # the '<datacenter>.<meta.stream>' topic (T291645). Logstash consumes this\n+        # topic via an explicit kafka input rather than the 'rsyslog-*' pattern.\n+        if ($!meta!stream != \"\") then {\n+            action(type=\"omkafka\"\n+                   name=\"omkafka_event_platform\"\n+                   broker=[\"kafka-logging1001.eqiad.wmnet:9093\",\"kafka-logging1002.eqiad.wmnet:9093\",\"kafka-logging1003.eqiad.wmnet:9093\",\"kafka-logging1004.eqiad.wmnet:9093\",\"kafka-logging1005.eqiad.wmnet:9093\"]\n+                   topic=\"event_platform_topic\"\n+                   dynatopic=\"on\"\n+                   dynatopic.cachesize=\"1000\"\n+                   partitions.auto=\"on\"\n+                   template=\"event_platform_json\"\n+                   queue.type=\"LinkedList\" queue.size=\"10000\" queue.filename=\"output_kafka_event_platform\"\n+                   queue.highWatermark=\"7000\" queue.lowWatermark=\"6000\"\n+                   queue.checkpointInterval=\"5\"\n+                   queue.maxDiskSpace=\"40960000\"\n+                   confParam=[ \"security.protocol=ssl\",\n+                               \"ssl.ca.location=/etc/ssl/certs/wmf-ca-certificates.crt\",\n+                               \"compression.codec=snappy\",\n+                               \"socket.timeout.ms=10000\",\n+                               \"socket.keepalive.enable=true\",\n+                               \"queue.buffering.max.ms=50\",\n+                               \"batch.num.messages=1000\" ]\n+            )\n+        } else {\n         action(type=\"omkafka\"\n                name=\"omkafka_syslog_cee\"\n                broker=[\"kafka-logging1001.eqiad.wmnet:9093\",\"kafka-logging1002.eqiad.wmnet:9093\",\"kafka-logging1003.eqiad.wmnet:9093\",\"kafka-logging1004.eqiad.wmnet:9093\",\"kafka-logging1005.eqiad.wmnet:9093\"]\n@@ -62,6 +100,7 @@\n                            \"queue.buffering.max.ms=50\",\n                            \"batch.num.messages=1000\" ]\n         )\n+        }\n     } else {\n         # if ecs_170 in log_outputs, use that template to format\n         # non-json-formatted syslog events into an ecs-compatible form"}], "perc_changed": "0.05%"}, "main": {"total": 3735, "only_in_self": [], "only_in_other": [], "resource_diffs": [{"resource": "Nginx::Site[xmldumps]"}, {"resource": "Rsyslog::Conf[output_kafka]"}, {"resource": "File[/etc/nginx/sites-available/xmldumps]", "content": "--- /etc/nginx/sites-available/xmldumps.orig\n+++ /etc/nginx/sites-available/xmldumps\n@@ -4,6 +4,74 @@\n ##########################\n \n limit_conn_zone $remote_addr$http_user_agent zone=addr:10m;\n+\n+# The ECS spec just uses the HTTP version numbers, so we strip the HTTP/ prefix here.\n+map $server_protocol $ecs_http_version {\n+    \"HTTP/1.0\"  \"1.0\";\n+    \"HTTP/1.1\"  \"1.1\";\n+    \"HTTP/2.0\"  \"2.0\";\n+    default     $server_protocol;\n+}\n+\n+# Here we use the HTTP status code to map to event.outcome\n+map $status $ecs_event_outcome {\n+    ~^2  \"success\";\n+    ~^3  \"success\";\n+    ~^4  \"failure\";\n+    ~^5  \"failure\";\n+    default \"unknown\";\n+}\n+\n+# nginx has no way to emit a literal '$', which the Event Platform '$schema' field\n+# name requires, so we expose one via a map and reference it as '${dollar}schema'.\n+map $host $dollar {\n+    default \"$\";\n+}\n+\n+# This directive configures the ECS compatible JSON log format.\n+# The '$schema', 'meta' and 'dt' fields are required by the Event Platform so that\n+# these events can be ingested into the Data Lake (see T291645). 'dt' is the Event\n+# Platform time field; '@timestamp' is retained for ECS compatibility.\n+log_format ecs_json escape=json\n+    '{'\n+        '\"${dollar}schema\":\"/development/elastic/ecs/1.0.0\",'\n+        '\"meta\":{'\n+            '\"stream\":\"webrequest.dumps.dev0\",'\n+            '\"id\":\"$request_id\",'\n+            '\"dt\":\"$time_iso8601\",'\n+            '\"domain\":\"$host\"'\n+        '},'\n+        '\"dt\":\"$time_iso8601\",'\n+        '\"@timestamp\":\"$time_iso8601\",'\n+        '\"ecs\":{\"version\":\"1.11.0\"},'\n+        '\"event\":{'\n+            '\"dataset\":\"dumps.access\",'\n+            '\"kind\":\"event\",'\n+            '\"category\":\"web\",'\n+            '\"type\":\"access\",'\n+            '\"outcome\":\"$ecs_event_outcome\"'\n+        '},'\n+        '\"http\":{'\n+            '\"request\":{'\n+                '\"method\":\"$request_method\",'\n+                '\"referrer\":\"$http_referer\",'\n+                '\"bytes\":$request_length'\n+            '},'\n+            '\"response\":{'\n+                '\"status_code\":$status,'\n+                '\"body\":{\"bytes\":$body_bytes_sent}'\n+            '},'\n+            '\"version\":\"$ecs_http_version\"'\n+        '},'\n+        '\"url\":{'\n+            '\"original\":\"$request_uri\",'\n+            '\"path\":\"$uri\",'\n+            '\"query\":\"$query_string\"'\n+        '},'\n+        '\"source\":{\"ip\":\"$remote_addr\"},'\n+        '\"user_agent\":{\"original\":\"$http_user_agent\"},'\n+        '\"destination\":{\"address\":\"$host\"}'\n+    '}';\n \n server {\n    listen [::]:443 default ssl ipv6only=off;\n@@ -22,6 +90,12 @@\n    add_header Strict-Transport-Security \"max-age=106384710; includeSubDomains; preload\" always;\n \n    root /srv/dumps/xmldatadumps/public;\n+\n+   # We want to send access logs to syslog in addition to the standard log files.\n+   # The 'dumps-http' tag wil match with an entry in /etc/rsyslog.lookup.d/lookup_table_output.json\n+   # so these logs will be sent to the kafka-logging cluster.\n+   access_log /var/log/nginx/access.log; # duplicate of the standard configuration in the http block.\n+   access_log syslog:server=unix:/dev/log,facility=local7,tag=dumps-http,severity=info ecs_json;\n \n    add_header Server 'clouddumps1002.wikimedia.org' always;\n "}, {"resource": "File[/etc/rsyslog.d/30-output-kafka.conf]", "content": "--- /etc/rsyslog.d/30-output-kafka.conf.orig\n+++ /etc/rsyslog.d/30-output-kafka.conf\n@@ -13,6 +13,19 @@\n \n # define a template to be used by omkafka dynatopic\n template(name=\"kafka_topic\" type=\"string\" string=\"rsyslog-%syslogseverity-text%\")\n+\n+# Event Platform support (T291645): messages that carry a 'meta.stream' field are\n+# Event Platform events (e.g. ECS formatted logs). Produce them to the\n+# '<datacenter>.<meta.stream>' topic so they can be ingested into the Data Lake.\n+# The datacenter prefix is supplied by puppet; the stream suffix is read from the\n+# parsed json message via the '%!meta!stream%' property.\n+template(name=\"event_platform_topic\" type=\"string\" string=\"eqiad.%!meta!stream%\")\n+\n+# Emit the parsed json message verbatim (with $schema, meta and dt intact) so the\n+# event remains valid for Event Platform / Data Lake ingestion.\n+template(name=\"event_platform_json\" type=\"list\") {\n+  property(name=\"$!all-json\")\n+}\n \n # send to kafka if lookup table contains \"kafka\" for relevant programname\n # $.log_outputs defined by lookup table in lookup_output.conf\n@@ -42,6 +55,31 @@\n     # unfortunately rsyslog doesn't allow variables to be used as template\n     # names, so the kafka action is duplicated here.\n     if $parsesuccess == \"OK\" then {\n+        # Event Platform events carry a 'meta.stream' field. Produce these only to\n+        # the '<datacenter>.<meta.stream>' topic (T291645). Logstash consumes this\n+        # topic via an explicit kafka input rather than the 'rsyslog-*' pattern.\n+        if ($!meta!stream != \"\") then {\n+            action(type=\"omkafka\"\n+                   name=\"omkafka_event_platform\"\n+                   broker=[\"kafka-logging1001.eqiad.wmnet:9093\",\"kafka-logging1002.eqiad.wmnet:9093\",\"kafka-logging1003.eqiad.wmnet:9093\",\"kafka-logging1004.eqiad.wmnet:9093\",\"kafka-logging1005.eqiad.wmnet:9093\"]\n+                   topic=\"event_platform_topic\"\n+                   dynatopic=\"on\"\n+                   dynatopic.cachesize=\"1000\"\n+                   partitions.auto=\"on\"\n+                   template=\"event_platform_json\"\n+                   queue.type=\"LinkedList\" queue.size=\"10000\" queue.filename=\"output_kafka_event_platform\"\n+                   queue.highWatermark=\"7000\" queue.lowWatermark=\"6000\"\n+                   queue.checkpointInterval=\"5\"\n+                   queue.maxDiskSpace=\"40960000\"\n+                   confParam=[ \"security.protocol=ssl\",\n+                               \"ssl.ca.location=/etc/ssl/certs/wmf-ca-certificates.crt\",\n+                               \"compression.codec=snappy\",\n+                               \"socket.timeout.ms=10000\",\n+                               \"socket.keepalive.enable=true\",\n+                               \"queue.buffering.max.ms=50\",\n+                               \"batch.num.messages=1000\" ]\n+            )\n+        } else {\n         action(type=\"omkafka\"\n                name=\"omkafka_syslog_cee\"\n                broker=[\"kafka-logging1001.eqiad.wmnet:9093\",\"kafka-logging1002.eqiad.wmnet:9093\",\"kafka-logging1003.eqiad.wmnet:9093\",\"kafka-logging1004.eqiad.wmnet:9093\",\"kafka-logging1005.eqiad.wmnet:9093\"]\n@@ -62,6 +100,7 @@\n                            \"queue.buffering.max.ms=50\",\n                            \"batch.num.messages=1000\" ]\n         )\n+        }\n     } else {\n         # if ecs_170 in log_outputs, use that template to format\n         # non-json-formatted syslog events into an ecs-compatible form"}], "perc_changed": "0.11%"}}}