Author : rgheorghe

/path/to/log.file

destination_index document_type:{"original": "document"}

python elasticsearch_to_rsyslog.py source_index destination_index

from elasticsearch import Elasticsearch
import json, socket, sys

source_cluster = ['server1', 'server2']
rsyslog_address = '127.0.0.1'
rsyslog_port = 5514

es = Elasticsearch(source_cluster,
      retry_on_timeout=True,
      max_retries=10)
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((rsyslog_address, rsyslog_port))


result = es.search(index=sys.argv[1], scroll='1m', search_type='scan', size=500)

while True:
  res = es.scroll(scroll_id=result['_scroll_id'], scroll='1m')
  for hit in result['hits']['hits']:
    s.send(sys.argv[2] + ' ' + hit["_type"] + ':' + json.dumps(hit["_source"])+'\n')
  if not result['hits']['hits']:
    break

s.close()

def de_dot(my_dict):
  for key, value in my_dict.iteritems():
    if '.' in key:
      my_dict[key.replace('.','_')] = my_dict.pop(key)
    if type(value) is dict:
      my_dict[key] = de_dot(my_dict.pop(key))
  return my_dict

s.send(sys.argv[2] + ' ' + hit["_type"] + ':' + json.dumps(de_dot(hit["_source"]))+'\n')

# messages bigger than this are truncated
$maxMessageSize 10000000  # ~10MB

# load the TCP input and the ES output modules
module(load="imtcp")
module(load="omelasticsearch")

main_queue(
  # buffer up to 1M messages in memory
  queue.size="1000000"
  # these threads process messages and send them to Elasticsearch
  queue.workerThreads="4"
  # rsyslog processes messages in batches to avoid queue contention
  # this will also be the Elasticsearch bulk size
  queue.dequeueBatchSize="4000"
)

# we use templates to specify how the data sent to Elasticsearch looks like
template(name="document" type="list"){
  # the "msg" variable contains the document
  property(name="msg")
}
template(name="index" type="list"){
  # "hostname" has the index name
  property(name="hostname")
}
template(name="type" type="list"){
  # "syslogtag" has the type name
  property(name="syslogtag")
}

# start the TCP listener on the port we pointed the Python script to
input(type="imtcp" port="5514")

# sending data to Elasticsearch, using the templates defined earlier
action(type="omelasticsearch"
  template="document"
  dynSearchIndex="on" searchIndex="index"
  dynSearchType="on" searchType="type"
  server="localhost"  # destination Elasticsearch host
  serverport="9200"   # and port
  bulkmode="on"  # use the bulk API
  action.resumeretrycount="-1"  # retry indefinitely if Elasticsearch is unreachable
)

rsyslogd -i /var/run/rsyslog_reindexer.pid -f /home/me/rsyslog_reindexer.conf

bin/zookeeper-server-start.sh config/zookeeper.properties
bin/kafka-server-start.sh config/server.properties
bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic rsyslog_logstash
bin/logstash -f logstash.conf
module(load="imuxsock")  # will listen to your local syslog
module(load="imfile")    # if you want to tail files
module(load="omkafka")   # lets you send to Kafka

input(type="imfile"
  File="/opt/logs/example*.log"
  Tag="examplelogs"
)

template(name="json_lines" type="list" option.json="on") {
  constant(value="{")
  constant(value="\"timestamp\":\"")
  property(name="timereported" dateFormat="rfc3339")
  constant(value="\",\"message\":\"")
  property(name="msg")
  constant(value="\",\"host\":\"")
  property(name="hostname")
  constant(value="\",\"severity\":\"")
  property(name="syslogseverity-text")
  constant(value="\",\"facility\":\"")
  property(name="syslogfacility-text")
  constant(value="\",\"syslog-tag\":\"")
  property(name="syslogtag")
  constant(value="\"}")
}

main_queue(
  queue.workerthreads="1"      # threads to work on the queue
  queue.dequeueBatchSize="100" # max number of messages to process at once
  queue.size="10000"           # max queue size
)

action(
  broker=["localhost:9092"]
  type="omkafka"
  topic="rsyslog_logstash"
  template="json"
)

input {
  kafka {
    zk_connect => "localhost:2181"
    topic_id => "rsyslog_logstash"
  }
}

output {
  elasticsearch {
    hosts => "localhost" # it used to be "host" pre-2.0
    port => 9200
    #ssl => "true"
    #protocol => "http" # removed in 2.0
  }
}

# system logs
module(load="imuxsock")
module(load="imklog")
# file
module(load="imfile")
# parser
module(load="mmnormalize")
# sender
module(load="omelasticsearch")
input(type="imfile"
      File="/var/log/apache*.log"
      Tag="apache:"
)
main_queue(
  queue.workerThreads="4"
  queue.dequeueBatchSize="1000"
  queue.size="10000"
)
main_queue(
  queue.workerThreads="4"
  queue.dequeueBatchSize="1000"
  queue.highWatermark="500000"    # max no. of events to hold in memory
  queue.lowWatermark="200000"     # use memory queue again, when it's back to this level
  queue.spoolDirectory="/var/run/rsyslog/queues"  # where to write on disk
  queue.fileName="stats_ruleset"
  queue.maxDiskSpace="5g"        # it will stop at this much disk space
  queue.size="5000000"           # or this many messages
  queue.saveOnShutdown="on"      # save memory queue contents to disk when rsyslog is exiting
)
action(type="mmnormalize"
  rulebase="/opt/rsyslog/apache.rb"
)
version=2

rule=:%clientip:word% %ident:word% %auth:word% [%timestamp:char-to:]%] "%verb:word% %request:word% HTTP/%httpversion:float%" %response:number% %bytes:number% "%referrer:char-to:"%" "%agent:char-to:"%"%blob:rest%

head -1 /path/to/log.file | /usr/lib/lognorm/lognormalizer -r /path/to/rulebase.rb -e json
template(name="all-json" type="list"){
  property(name="$!all-json")
}
template(name="logstash-index"
  type="list") {
    constant(value="logstash-")
    property(name="timereported" dateFormat="rfc3339" position.from="1" position.to="4")
    constant(value=".")
    property(name="timereported" dateFormat="rfc3339" position.from="6" position.to="7")
    constant(value=".")
    property(name="timereported" dateFormat="rfc3339" position.from="9" position.to="10")
}
action(type="omelasticsearch"
  template="all-json"
  dynSearchIndex="on"
  searchIndex="logstash-index"
  searchType="apache"
  server="MY-ELASTICSEARCH-SERVER"
  bulkmode="on"
  action.resumeretrycount="-1"
)
template(name="plain-syslog"
  type="list") {
    constant(value="{")
      constant(value="\"timestamp\":\"")     property(name="timereported" dateFormat="rfc3339")
      constant(value="\",\"host\":\"")        property(name="hostname")
      constant(value="\",\"severity\":\"")    property(name="syslogseverity-text")
      constant(value="\",\"facility\":\"")    property(name="syslogfacility-text")
      constant(value="\",\"tag\":\"")   property(name="syslogtag" format="json")
      constant(value="\",\"message\":\"")    property(name="msg" format="json")
    constant(value="\"}")
}
if $parsesuccess == "OK" then {
 action(type="omelasticsearch"
  template="all-json"
  ...
 )
} else {
 action(type="omelasticsearch"
  template="plain-syslog"
  ...
 )
}
action(type="omelasticsearch"
  template="all-json or plain-syslog"
  searchIndex="LOGSENE-APP-TOKEN-GOES-HERE"
  searchType="apache"
  server="logsene-receiver.sematext.com"
  serverport="80"
  bulkmode="on"
  action.resumeretrycount="-1"
)
curl -XPUT 'logsene-receiver.sematext.com/_template/aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee_MyTemplate' -d '{
 "template" : "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee*",
 "order" : 21,
 "mappings" : {
  "_default_" : {
   "properties" : {
    "timestamp" : { "type" : "date" }
   }
  },
  "firstapp" : {
   "properties" : {
    "user" : { "type" : "string" }
   }
  },
  "secondapp" : {
   "properties" : {
    "amount" : { "type" : "long" }
   }
  }
 }
}'
module(load="imuxsock")        # can listen to local syslog socket
module(load="omelasticsearch") # can forward to Elasticsearch
module(load="mmjsonparse")     # can parse JSON

action(type="mmjsonparse")  # parse CEE-formatted messages

template(name="syslog-cee" type="list") {  # Elasticsearch documents will contain
  property(name="$!all-json")              # all JSON fields that were parsed
}

action(
  type="omelasticsearch"
  template="syslog-cee"                     # use the template defined earlier
  server="logsene-receiver.sematext.com"
  serverport="80"
  searchType="syslogapp"
  searchIndex="LOGSENE-APP-TOKEN-GOES-HERE"
  bulkmode="on"                                # send logs in batches
  queue.dequeuebatchsize="1000"                # of up to 1000
  action.resumeretrycount="-1"    # retry indefinitely (buffer) if destination is unreachable
)
module(load="imtcp")   # TCP input module
module(load="omelasticsearch") # Elasticsearch output module

input(type="imtcp" port="13514")  # where to listen for TCP messages

main_queue(
  queue.size="1000000"   # capacity of the main queue
  queue.dequeuebatchsize="1000"  # process messages in batches of 1000 and move them to the action queues
  queue.workerthreads="2"  # 2 threads for the main queue
)

# template to generate JSON documents for Elasticsearch in Logstash format
template(name="plain-syslog"
         type="list") {
           constant(value="{")
             constant(value="\"@timestamp\":\"")      property(name="timereported" dateFormat="rfc3339")
             constant(value="\",\"host\":\"")        property(name="hostname")
             constant(value="\",\"severity\":\"")    property(name="syslogseverity-text")
             constant(value="\",\"facility\":\"")    property(name="syslogfacility-text")
             constant(value="\",\"syslogtag\":\"")   property(name="syslogtag" format="json")
             constant(value="\",\"message\":\"")    property(name="msg" format="json")
             constant(value="\"}")
         }

action(type="omelasticsearch"
       template="plain-syslog"  # use the template defined earlier
       searchIndex="test-index"
       bulkmode="on"                   # use the Bulk API
       queue.dequeuebatchsize="5000"   # ES bulk size
       queue.size="100000"   # capacity of the action queue
       queue.workerthreads="5"   # 5 workers for the action
       action.resumeretrycount="-1"  # retry indefinitely if ES is unreachable
)
module(load="imuxsock")             # for listening to /dev/log
module(load="omelasticsearch") # for outputting to Elasticsearch
# this is for index names to be like: logstash-YYYY.MM.DD
template(name="logstash-index"
  type="list") {
    constant(value="logstash-")
    property(name="timereported" dateFormat="rfc3339" position.from="1" position.to="4")
    constant(value=".")
    property(name="timereported" dateFormat="rfc3339" position.from="6" position.to="7")
    constant(value=".")
    property(name="timereported" dateFormat="rfc3339" position.from="9" position.to="10")
}

# this is for formatting our syslog in JSON with @timestamp
template(name="plain-syslog"
  type="list") {
    constant(value="{")
      constant(value="\"@timestamp\":\"")     property(name="timereported" dateFormat="rfc3339")
      constant(value="\",\"host\":\"")        property(name="hostname")
      constant(value="\",\"severity\":\"")    property(name="syslogseverity-text")
      constant(value="\",\"facility\":\"")    property(name="syslogfacility-text")
      constant(value="\",\"tag\":\"")   property(name="syslogtag" format="json")
      constant(value="\",\"message\":\"")    property(name="msg" format="json")
    constant(value="\"}")
}
# this is where we actually send the logs to Elasticsearch (localhost:9200 by default)
action(type="omelasticsearch"
    template="plain-syslog"
    searchIndex="logstash-index"
    dynSearchIndex="on")
#load needed modules
module(load="imuxsock") # provides support for local system logging
module(load="imklog") # provides kernel logging support
module(load="mmjsonparse") #for parsing CEE-enhanced syslog messages

#try to parse structured logs
*.* :mmjsonparse:

#define a template to print field "foo"
template(name="justFoo" type="list") {
  property(name="$!foo")
  constant(value="\n") #we'll separate logs with a newline
}

#and now let's write the contents of field "foo" in a file
*.* action(type="omfile"
           template="justFoo"
           file="/var/log/foo")
#load needed modules
module(load="imuxsock") # provides support for local system logging
module(load="imklog") # provides kernel logging support
module(load="mmjsonparse") #for parsing CEE-enhanced syslog messages
module(load="omelasticsearch") #for indexing to Elasticsearch

#try to parse a structured log
*.* :mmjsonparse:

#define a template to print all fields of the message
template(name="messageToES" type="list") {
  property(name="$!all-json")
}

#write the JSON message to the local ES node
*.* action(type="omelasticsearch"
           template="messageToES")
template(name="customTemplate"
   type="list") {
#- open the curly brackets,
#- add the timestamp field surrounded with quotes
#- add the colon which separates field from value
#- open the quotes for the timestamp itself
   constant(value="{\"timestamp\":\"")
#- add the timestamp from the log,
# format it in RFC-3339, so that ES detects it by default
   property(name="timereported" dateFormat="rfc3339")
#- close the quotes for timestamp,
#- add a comma, then the syslogtag field in the same manner
   constant(value="\",\"syslogtag\":\"")
#- now the syslogtag field itself
# and format="json" will ensure special characters
# are escaped so they won't break our JSON
   property(name="syslogtag" format="json")
#- close the quotes for syslogtag
#- add a comma
#- then add our JSON-formatted syslog message,
# but start from the 2nd position to omit the left
# curly bracket
   constant(value="\",")
   property(name="$!all-json" position.from="2")
}

Author : rgheorghe

Writing the custom script

Configuring rsyslog

Getting the ingredients

Configuring rsyslog

Configuring Logstash

Getting the ingredients

Loading modules

Configure inputs

Queue and workers

Parsing with mmnormalize

Template for parsed logs

Template for time-based indices

Putting both Apache and system logs together

Elasticsearch Mapping and Logs

Managing Multiple Types

Sending JSON Logs to Specific Types

Filtering by Type

The Weapon and the Target

Rsyslog Architecture Overview

Configuration

Results

Conclusion

Getting all the ingredients

Putting them all together

More tips

Original post: Structured Logging with rsyslog and Elasticsearch via @sematext

On structured logging

Enter CEE and Lumberjack: structured logging with syslog

CEE-enhanced syslog with rsyslog

Indexing logs in Elasticsearch

Including other properties

Summary