Skip to content

www: add initial cloudflare log-collect tooling #987

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup/www/ansible-vars.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ packages:
- git
- python-pip
- gnuplot
- golang-go
benchmark_host: 50.97.245.4
3 changes: 2 additions & 1 deletion setup/www/resources/scripts/sync-benchmarking.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
pidfile=/var/run/nodejs/sync-benchmarking.pid
cmd="
rsync -aqz --delete benchmark:charts/ /home/www/benchmarking/charts/ &&
rsync -aqz --delete benchmark:coverage-out/out/ /home/www/coverage/
rsync -aqz --delete benchmark:coverage-out/out/ /home/www/coverage/ &&
scp benchmark:benchmarking/www/index.html /home/www/benchmarking/
"

if [ -a "$pidfile" -a -r "$pidfile" ]; then
Expand Down
2 changes: 1 addition & 1 deletion setup/www/tasks/base.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
- name: Base | Add the NodeSource Node.js repo
command: "bash -c 'curl -sL https://deb.nodesource.com/setup_6.x | bash -'"
command: "bash -c 'curl -sL https://deb.nodesource.com/setup_8.x | bash -'"
tags: base

- name: Base | APT Update
Expand Down
51 changes: 47 additions & 4 deletions setup/www/tasks/tools.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,59 @@
mode: 0644
tags: nightly-builder

# log-collect: hourly fetch/filter tooling for Cloudflare request logs

- name: log-collect | Make /home/logs/log-collect/
  file:
    path: /home/logs/log-collect/
    state: directory
    mode: 0750
    owner: root
  tags: log-collect

- name: log-collect | Copy log-collect.sh script
  template:
    src: ./resources/scripts/log-collect.sh.j2
    dest: /home/logs/log-collect/log-collect.sh
    mode: 0700
    owner: root
    group: root
  tags: log-collect

# NOTE(review): `go get` installs binaries under $GOPATH/bin, while
# log-collect.sh.j2 expects /usr/local/go/bin/logshare-cli — confirm the
# effective GOPATH/GOBIN on the host matches that path.
- name: log-collect | Install Cloudflare logshare-cli
  command: "go get github.com/cloudflare/logshare/..."
  tags: log-collect

# Copy the scripts and package.json BEFORE running `npm install`, which
# needs package.json present in the target directory.
- name: log-collect | Copy remaining log-collect components
  copy:
    src: './tools/log-collect/{{ item }}'
    dest: '/home/logs/log-collect/{{ item }}'
    mode: 0755
    owner: root
    group: root
  with_items:
    - json-log-filter.js
    - json-to-nginx.js
    - nginxify.sh
    - package.json
    - sanity-check-logs.sh
  tags: log-collect

- name: log-collect | Install log-collect Node dependencies
  command: "npm install"
  args:
    chdir: /home/logs/log-collect/
  tags: log-collect

#TODO: the branches are hardwired here, they should be in vars somewhere
# Installs one crontab line per item; lineinfile keeps runs idempotent.
- name: tools | Add periodic tasks to crontab
  lineinfile:
    dest: /etc/crontab
    line: "{{ item }}"
  with_items:
    - '0 20 * * * dist /usr/bin/nodejs-nightly-builder --type nightly --ref heads/v6.x --config /etc/nightly-builder.json'
    - '0 19 * * * dist /usr/bin/nodejs-nightly-builder --type nightly --ref heads/v7.x --config /etc/nightly-builder.json'
    - '0 20 * * * dist /usr/bin/nodejs-nightly-builder --type v8-canary --ref heads/canary --config /etc/nightly-builder-v8-canary.json'
    - '0 19 * * * dist /usr/bin/nodejs-nightly-builder --type nightly --ref heads/v9.x-staging --config /etc/nightly-builder.json'
    - '0 18 * * * dist /usr/bin/nodejs-nightly-builder --type nightly --ref heads/master --config /etc/nightly-builder.json'
    - '0 11 * * * dist /usr/bin/nodejs-nightly-builder --type nightly --ref heads/v8.x --config /etc/nightly-builder-chakracore.json'
    - '0 11 * * * dist /usr/bin/nodejs-nightly-builder --type nightly --ref heads/v9.x --config /etc/nightly-builder-chakracore.json'
    - '0 10 * * * dist /usr/bin/nodejs-nightly-builder --type nightly --ref heads/master --config /etc/nightly-builder-chakracore.json'
    - '1 0 * * * root npm update -g nodejs-latest-linker nodejs-dist-indexer nodejs-nightly-builder >& /dev/null'
    - '55 * * * * root /home/logs/log-collect/log-collect.sh'
    - '30 1,13 * * * root /home/logs/log-collect/sanity-check-logs.sh'
  tags: tools
58 changes: 58 additions & 0 deletions setup/www/tools/log-collect/json-log-filter.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env node

// Filters Cloudflare ELS JSON log records (one JSON object per line on
// stdin), keeping only entries that were served successfully (HTTP 200 at
// the edge or origin, with more than 10 bytes sent from the edge) and whose
// request URI matches at least one regular expression supplied on the
// command line. Surviving records are re-serialized one per line on stdout.

const through2 = require('through2')
const split2 = require('split2')

if (process.argv.length < 3) {
  console.error('cat log | json-log-filter.js [<regex>[, <regex>[, <regex>...]]]')
  process.exit(1)
}

// Compile every CLI argument into a RegExp tested against request URIs
const matches = process.argv.slice(2).map(function argvToRegExp (s) {
  return new RegExp(s)
})

// Parse each input line as JSON. Malformed lines are skipped with a
// value-less callback(): pushing `null` into an object-mode stream signals
// end-of-stream and would silently truncate all output at the first bad
// line.
const jsonify = through2.obj(function jsonifyLine (chunk, enc, callback) {
  try {
    callback(null, JSON.parse(chunk))
  } catch (e) {
    callback()
  }
})

// Keep only successful responses with a real body. Each nested object is
// guarded — not every record carries both edge and origin sections, so an
// unguarded `chunk.edgeResponse.bytes` would throw on origin-only matches.
const filterResponse = through2.obj(function filterResponseLine (chunk, enc, callback) {
  const edgeOk = chunk.edgeResponse != null && chunk.edgeResponse.status === 200
  const originOk = chunk.originResponse != null && chunk.originResponse.status === 200
  if ((edgeOk || originOk) && chunk.edgeResponse != null && chunk.edgeResponse.bytes > 10) {
    return callback(null, chunk)
  }
  callback() // filtered out
})

// Keep only entries whose request URI matches one of the CLI regexes
const filterUri = through2.obj(function filterUriLine (chunk, enc, callback) {
  const uri = chunk.clientRequest != null ? chunk.clientRequest.uri : null
  if (uri != null && matches.some(function (re) { return re.test(uri) })) {
    return callback(null, chunk)
  }
  callback() // filtered out
})

// Re-serialize each surviving record as a single JSON line
const outWrite = through2.obj(function outWriteLine (chunk, enc, callback) {
  callback(null, `${JSON.stringify(chunk)}\n`)
})

process.stdin
  .pipe(split2())
  .pipe(jsonify)
  .pipe(filterResponse)
  .pipe(filterUri)
  .pipe(outWrite)
  .pipe(process.stdout)
46 changes: 46 additions & 0 deletions setup/www/tools/log-collect/json-to-nginx.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env node

// Converts Cloudflare ELS JSON log records (one per line on stdin) into
// nginx-style access-log lines on stdout, modelled on this nginx format:
//
// log_format nodejs '$remote_addr - $remote_user [$time_local] '
//                   '"$request" $status $body_bytes_sent '
//                   '"$http_referer" "$http_user_agent" "$http_x_forwarded_for"';
//
// Example output line:
//0.0.0.0 - - [14/Apr/2017:09:25:15 +0000] "GET /en/docs/es6/ HTTP/1.1" 200 4285 "https://www.google.com/" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/0.0.0.0 Safari/537.36" "0.0.0.0"
//
// Example input record:
//{"brandId":100,"flags":2,"hosterId":0,"ownerId":2598267,"rayId":"34f5210c4b2f0b56","securityLevel":"eoff","timestamp":1492156801968000000,"unstable":null,"zoneId":15832072,"zoneName":"nodejs.org","zonePlan":"business","cache":{"bckType":"byc","cacheExternalIp":"0.0.0.0","cacheExternalPort":26380,"cacheFileKey":null,"cacheInternalIp":"0.0.0.0","cacheServerName":"22c24","cacheStatus":"unknown","cacheTokens":0,"endTimestamp":1492156803756000000,"startTimestamp":1492156801971000000},"cacheRequest":{"headers":null,"keepaliveStatus":"noReuseOpenFailed"},"cacheResponse":{"bodyBytes":0,"bytes":476907,"contentType":"application/gzip","retriedStatus":0,"status":200},"client":{"asNum":16509,"country":"jp","deviceType":"desktop","ip":"0.0.0.0","ipClass":"noRecord","srcPort":42727,"sslCipher":"ECDHE-RSA-AES128-GCM-SHA256","sslFlags":1,"sslProtocol":"TLSv1.2"},"clientRequest":{"accept":"","body":null,"bodyBytes":0,"bytes":1056,"cookies":null,"flags":0,"headers":[],"httpHost":"nodejs.org","httpMethod":"GET","httpProtocol":"HTTP/1.1","referer":"","uri":"/download/release/v6.9.2/node-v6.9.2-headers.tar.gz","userAgent":"node-gyp v3.4.0 (node v6.9.2)"},"edgeResponse":{"bodyBytes":476907,"bytes":477355,"compressionRatio":0,"contentType":"application/gzip","headers":null,"setCookies":null,"status":200},...}

const through2 = require('through2')
const split2 = require('split2')
const strftime = require('strftime').timezone(0) // render times in UTC
const timefmt = '%e/%b/%Y:%H:%M:%S +0000'

// Parse each line as JSON. Malformed lines are skipped with a value-less
// callback(): pushing `null` into an object-mode stream signals
// end-of-stream and would truncate the output at the first bad line.
const jsonify = through2.obj(function jsonifyLine (chunk, enc, callback) {
  try {
    callback(null, JSON.parse(chunk))
  } catch (e) {
    callback()
  }
})

// Map one Cloudflare record onto one nginx-style log line. The client
// country code replaces nginx's usual "-" separator, and the client IP is
// reused for X-Forwarded-For.
const transform = through2.obj(function transformLine (chunk, enc, callback) {
  const remoteAddr = chunk.client.ip
  const country = chunk.client.country
  const remoteUser = '-'
  // Cloudflare timestamps are nanoseconds since the epoch; Date wants ms
  const timeLocal = strftime(timefmt, new Date(chunk.timestamp / 1000000))
  const request = `${chunk.clientRequest.httpMethod} ${chunk.clientRequest.uri} ${chunk.clientRequest.httpProtocol}`
  const status = chunk.edgeResponse.status
  const bodyBytesSent = chunk.edgeResponse.bodyBytes
  const httpReferer = chunk.clientRequest.referer.replace(/"/g, '\\"')
  const httpUserAgent = chunk.clientRequest.userAgent.replace(/"/g, '\\"')
  const httpXForwardedFor = remoteAddr

  callback(null, `${remoteAddr} ${country} ${remoteUser} [${timeLocal}] "${request}" ${status} ${bodyBytesSent} "${httpReferer}" "${httpUserAgent}" "${httpXForwardedFor}"\n`)
})

process.stdin
  .pipe(split2())
  .pipe(jsonify)
  .pipe(transform)
  .pipe(process.stdout)
65 changes: 65 additions & 0 deletions setup/www/tools/log-collect/log-collect.sh.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/bash

# Fetches Cloudflare request logs for nodejs.org via logshare-cli, one file
# per hour, filters them down to download-asset entries with
# json-log-filter.js, and stores each hour xz-compressed in $logdir.
# Intended to run hourly from cron (installed by tools.yaml).

set -e
# Without pipefail, a logshare-cli failure is masked by xz's exit status
set -o pipefail

export TZ=UTC

__dirname="$(CDPATH= cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

zone="nodejs.org"
apikey="{{ cdn_api_key }}"
apiemail="{{ cdn_api_email }}"
# only keep requests for downloadable release assets
logregex='^.*\.(tar\.xz|tar\.gz|exe|msi|pkg|7z|zip|lib|pdb|exp)(\?.*)?$'
logdir="/home/logs/$zone"
logshare="/usr/local/go/bin/logshare-cli"
xz="xz -z -9 -e"

fmt="%b %e %H:00:00 UTC %Y"
earliest=$(date --date="72 hours ago" +"%s")
end=$(date --date="+1 hour" +"$fmt")
overwritecount=0

mkdir -p "$logdir"

while true; do
  # file format is <start timestamp>-<end timestamp>.log.xz and there's one per hour
  # we start from the current hour, knowing that it's not yet complete, and work back
  # until 72 hours from now, which is how long CloudFlare stores logs for
  start=$(date --date="${end}-1 hour" +"$fmt")
  startts=$(date --date="$start" +"%s")
  endts=$(date --date="$end" +"%s")
  if (( startts < earliest )); then break; fi

  filename="${startts}-${endts}.log.xz"
  tmpfile="/tmp/${filename}"
  outfile="${logdir}/${filename}"
  # we allow overwriting of the most recent 2 files, assuming that the last file wasn't
  # complete and there's a possibility the second last one wasn't complete
  # it's also possible that older files were not properly fetched or have been removed
  # by the sanity checker so we make sure we redo any that are missing
  if (( overwritecount < 2 )) || [ ! -f "$outfile" ]; then
    echo "Fetching ${filename}..."
    # Filter the entries from CloudFlare so we only keep the download asset entries.
    # The pipeline is tested with `if !` so a failure anywhere in it reports
    # an error — checking $? afterwards was dead code under `set -e`.
    if ! $logshare --count=-1 --api-key="$apikey" --api-email="$apiemail" --zone-name="$zone" --start-time="$startts" --end-time="$endts" \
        | "${__dirname}/json-log-filter.js" "${logregex}" \
        | $xz > "$tmpfile"; then
      echo "Failed to fetch logs"
      exit 1
    fi
    # check file size as a sanity check, it should be in the range of 3-4Mb, it's possible
    # to get a bad set from CloudFlare so we won't bother keeping files that are sufficiently
    # suspicious
    # bad reads should be rectified in the next run since this is run every hour
    fsize=$(stat -c '%s' "$tmpfile")
    if (( fsize < 500000 )); then
      echo "Unsuccessful fetch of $filename"
      rm "$tmpfile"
    else
      mv "$tmpfile" "$outfile"
      overwritecount=$((overwritecount + 1))
    fi
  fi

  end="$start"
done
29 changes: 29 additions & 0 deletions setup/www/tools/log-collect/nginxify.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash

# Walks back over the last 72 hours of collected Cloudflare log files and
# derives, for each hourly .log.xz file, the matching nginx-format output
# path under $nginxlogdir.

set -e

export TZ=UTC

__dirname="$(CDPATH= cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

zone="nodejs.org"
logdir="/home/logs/$zone"
nginxlogdir="/home/logs/$zone/nginx-format"

fmt="%b %e %H:00:00 UTC %Y"
earliest=$(date --date="72 hours ago" +"%s")
end=$(date --date="+1 hour" +"$fmt")

while true; do
  # one file per hour, named <start timestamp>-<end timestamp>.log.xz
  start=$(date --date="${end}-1 hour" +"$fmt")
  startts=$(date --date="$start" +"%s")
  endts=$(date --date="$end" +"%s")
  if (( startts < earliest )); then break; fi

  filename="${startts}-${endts}.log.xz"
  nginxfilename="${startts}-${endts}.log"
  infile="${logdir}/${filename}"
  nginxfile="${nginxlogdir}/${nginxfilename}"
  # NOTE(review): $infile and $nginxfile are computed but never consumed in
  # this loop — the conversion step (presumably `xzcat "$infile" |
  # json-to-nginx.js > "$nginxfile"`) appears to be missing; confirm the
  # intended behavior before relying on this script.

  end="$start"
done
14 changes: 14 additions & 0 deletions setup/www/tools/log-collect/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{
"name": "json-log-filter",
"version": "1.0.0",
"description": "",
"main": "json-log-filter.js",
"author": "",
"license": "Apache-2.0",
"dependencies": {
"glob": "^7.1.1",
"split2": "^2.1.1",
"strftime": "^0.10.0",
"through2": "^2.0.3"
}
}
43 changes: 43 additions & 0 deletions setup/www/tools/log-collect/sanity-check-logs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash

# Scans the last ~25 hours of collected Cloudflare log files and deletes any
# file whose final entry falls suspiciously early in its hour, so that
# log-collect.sh will re-fetch it on the next run. Installed to run twice
# daily from cron (see tools.yaml).

set -e

fmt="%b %e %H:00:00 UTC %Y"
earliest=$(date --date="25 hours ago" +"%s")
end=$(date +"$fmt")
zone=nodejs.org
logdir="/home/logs/$zone"
# the last entry of a complete hour is expected at or after this minute
latestminute=57

# Grab the last entry of the file, parse it and extract the timestamp,
# then check that the timestamp is reasonably late—on a busy site it should
# be in the 59th minute because we expect many requests per minute of the
# hour. If the time of the last entry seems too early then delete the file and
# expect the log fetcher to grab it again and hopefully get it right.
checkfile() {
  local file=$1
  local lastjson="$(xzcat "$file" | tail -1)"
  # Cloudflare timestamps are nanoseconds since the epoch; convert to ms
  local lastminute=$(node -p 'new Date(Math.round(JSON.parse(process.argv[1]).timestamp / 1000000)).getMinutes()' "$lastjson")
  echo "$file = $lastminute"
  if (( lastminute < latestminute )); then
    echo "Bad file $file, last minute = $lastminute"
    rm "$file"
  fi
}

while true; do
  # file format is <start timestamp>-<end timestamp>.log.xz and there's one per hour
  # we start from the last hour and work back 25 hours (1 hour overlap for safety)
  start=$(date --date="${end}-1 hour" +"$fmt")
  startts=$(date --date="$start" +"%s")
  endts=$(date --date="$end" +"%s")
  if (( startts < earliest )); then break; fi

  filename="${startts}-${endts}.log.xz"
  logfile="${logdir}/${filename}"
  if [ -f "$logfile" ]; then
    checkfile "$logfile"
  fi

  end="$start"
done