Commit 16707b50 authored by matthu017

Refactored get_data to use splunk

start_searches added to process 3 searches at once
generate_reports now asks for a splunk user and password
parent 4e7b0c4b
1 merge request: !2 Refactored to use splunk
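In outline, each data script now drives a Splunk search job over its REST API: start_searches POSTs a query to create a job and saves the returned search ID (sid), and each get_data script polls the job until its dispatchState is DONE, then pages through the results as CSV. A minimal sketch of that lifecycle, using the same endpoints as the scripts below (the host, credentials, and query here are placeholders):

require 'net/http'
require 'openssl'
require 'rexml/document'
include REXML

base = URI('https://splunk.example.edu:8089')   # placeholder host
http = Net::HTTP.new(base.host, base.port)
http.use_ssl = true
http.verify_mode = OpenSSL::SSL::VERIFY_NONE    # matches the scripts below

# 1. Create the search job; the response carries its search ID (sid)
req = Net::HTTP::Post.new('/services/search/jobs')
req.basic_auth('admin', 'pass')                 # placeholder credentials
req.set_form_data('search' => 'search index=_internal | head 10')
sid = XPath.first(Document.new(http.request(req).body), '//sid').text

# 2. Poll the job until it reports DONE
loop do
  req = Net::HTTP::Get.new("/services/search/jobs/#{sid}")
  req.basic_auth('admin', 'pass')
  state = XPath.first(Document.new(http.request(req).body),
                      "//s:key[@name='dispatchState']").text
  break if state == 'DONE'
  sleep 1
end

# 3. Fetch the results as CSV, one page at a time
req = Net::HTTP::Get.new("/services/search/jobs/#{sid}/results?output_mode=csv&count=100&offset=0")
req.basic_auth('admin', 'pass')
puts http.request(req).body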
#!/bin/env ruby
# Gathers per-app usage data from the saved Splunk "app" search.
require 'net/http'
require 'uri'
require 'csv'
require 'openssl'
require 'rexml/document'
include REXML

#
# Set options here
#
OUTPUT = ENV["APP_DATA_FILE"] || "data.app.txt"
SID_DIR = ENV["SID_DIR"] || "sid"
APP = ENV["APP"] || "ondemand"
HOST = ENV["HOST"] || "ondemand.osc.edu"
START_DATE = ENV["START_DATE"] || "2016-11-29"
END_DATE = ENV["END_DATE"] || "today"
USER = ENV["USER"] || "admin"
PASS = ENV["PASS"] || "pass"

COUNT = 60000                        # results fetched per page
BASEURL = 'https://splunk.osc.edu:8089'
DELIM = "|"

# column indexes in the CSV returned by the search
TIME_COLUMN = 0
USR_COLUMN = 1
APP_COLUMN = 2

# Fix app token from page path
APP_BASE_URI = "/pun"
APP_TOKEN = {
@@ -97,17 +94,6 @@
STDERR.puts "HOST = #{HOST}"
STDERR.puts "START_DATE = #{START_DATE}"
STDERR.puts "END_DATE = #{END_DATE}"
class InvalidToken < StandardError; end

def parse_uri(str, **opts)
@@ -124,42 +110,86 @@
def parse_token(str, user: "")
  raise InvalidToken, "failed to parse app token: #{str}"
end
# Poll the search job until it finishes; returns the number of matched events
def splunk_search_state(sid)
  # endpoint that reports the job's dispatch state
  done_uri = URI.parse(BASEURL + '/services/search/jobs/' + sid)
  done_http = Net::HTTP.new(done_uri.host, done_uri.port)
  done_http.use_ssl = true
  done_http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  done_req = Net::HTTP::Get.new(done_uri.request_uri)
  done_req.basic_auth(USER, PASS)

  checkVal = 'parsing'
  counter = 0
  result_count = 0
  while checkVal != 'DONE'
    done_res = done_http.request(done_req)
    xmldoc = Document.new(done_res.body)
    checkVal = XPath.first(xmldoc, "//s:key[@name='dispatchState']").text
    progress = XPath.first(xmldoc, "//s:key[@name='doneProgress']").text
    result_count = XPath.first(xmldoc, "//s:key[@name='eventCount']").text
    # report progress every 5 polls
    puts "Generating Data... #{progress}/1.0" if counter % 5 == 0
    counter += 1
    sleep(1)
  end
  puts 'Data generated'
  result_count.to_i
end
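The document polled above is Splunk's job-status XML: the relevant keys sit in an s:dict inside an Atom entry. An abbreviated, illustrative response (assumption: real responses carry many more keys than shown) parses with the same XPaths:

require 'rexml/document'
include REXML

# Abbreviated, illustrative job-status response (not a verbatim Splunk payload)
body = <<~XML
  <entry xmlns="http://www.w3.org/2005/Atom"
         xmlns:s="http://dev.splunk.com/ns/rest">
    <content type="text/xml">
      <s:dict>
        <s:key name="dispatchState">DONE</s:key>
        <s:key name="doneProgress">1.0</s:key>
        <s:key name="eventCount">150000</s:key>
      </s:dict>
    </content>
  </entry>
XML

doc = Document.new(body)
XPath.first(doc, "//s:key[@name='dispatchState']").text  # => "DONE"
XPath.first(doc, "//s:key[@name='eventCount']").text     # => "150000"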
# Fetch one page of results for the given sid, as CSV, starting at offset
def splunk_search_results(sid, offset)
  uri = URI.parse(BASEURL)
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = true
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  # only the path and query of this URI are sent; the host comes from http above
  req_uri = URI::HTTP.build(path: "/services/search/jobs/#{sid}/results", query: "output_mode=csv&count=#{COUNT}&offset=#{offset}")
  req = Net::HTTP::Get.new(req_uri)
  req.basic_auth(USER, PASS)
  res = http.request(req)
  res.body
end
# Read the saved search ID, poll until done, then page through the
# results and write them out as delimited text
def gather_data(file)
  target = open("#{SID_DIR}/#{APP}.app", "r")
  sid = target.read
  target.close
  puts "SID: #{sid}"

  request_num = splunk_search_state(sid) / COUNT   # number of full pages

  target = open(file, "w")
  for i in 0..request_num
    results = splunk_search_results(sid, i * COUNT)
    csv = CSV.new(results)
    csv.shift # drop the header row
    for row in csv
      begin
        uri = URI(row[APP_COLUMN])
        app = uri.path
        row[APP_COLUMN] = parse_uri(app, user: row[USR_COLUMN])
        row << 1    # hit count (one deduped session per row)
        row << app  # keep the original page path as the last column
        target.write "#{row.join(DELIM)}\n"
      rescue => e
        $stderr.puts "Warning: skipping row #{row}: #{e.message}"
      end
    end
  end
  puts 'Done!'
ensure
  target.close
end

gather_data(OUTPUT)
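One note on the paging arithmetic above: integer division plus the inclusive 0..request_num range is what picks up the final partial page. For example, with COUNT = 60000:

COUNT = 60_000
event_count = 150_000                  # say the job matched 150,000 events
request_num = event_count / COUNT      # => 2 (integer division)
(0..request_num).map { |i| i * COUNT } # => [0, 60000, 120000]
# offsets 0 and 60000 return full pages; 120000 returns the last 30,000 rows
# (an exact multiple just costs one extra, empty page at the end)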
#!/bin/env ruby
# Gathers per-client (browser and OS) usage data from the saved Splunk "client" search.
require 'net/http'
require 'uri'
require 'csv'
require 'openssl'
require 'rexml/document'
include REXML

#
# Set options here
#
OUTPUT = ENV["CLIENT_DATA_FILE"] || "data.client.txt"
SID_DIR = ENV["SID_DIR"] || "sid"
APP = ENV["APP"] || "ondemand"
HOST = ENV["HOST"] || "ondemand.osc.edu"
START_DATE = ENV["START_DATE"] || "2016-11-29"
END_DATE = ENV["END_DATE"] || "today"
USER = ENV["USER"] || "admin"
PASS = ENV["PASS"] || "pass"

COUNT = 60000
BASEURL = 'https://splunk.osc.edu:8089'
DELIM = "|"

# column indexes in the CSV returned by the client search
BROWSER_VERSION = 3   # ua_major
BROWSER_MINOR   = 6   # ua_minor
BROWSER_PATCH   = 7   # ua_patch
OS_VERSION      = 5   # ua_os_major
OS_MINOR        = 8   # ua_os_minor
OS_PATCH        = 9   # ua_os_patch
#
# Do not modify below
#
@@ -47,48 +41,95 @@
STDERR.puts "HOST = #{HOST}"
STDERR.puts "START_DATE = #{START_DATE}"
STDERR.puts "END_DATE = #{END_DATE}"
class InvalidToken < StandardError; end

# Poll the search job until it finishes; returns the number of matched events
def splunk_search_state(sid)
  done_uri = URI.parse(BASEURL + '/services/search/jobs/' + sid)
  done_http = Net::HTTP.new(done_uri.host, done_uri.port)
  done_http.use_ssl = true
  done_http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  done_req = Net::HTTP::Get.new(done_uri.request_uri)
  done_req.basic_auth(USER, PASS)

  checkVal = 'parsing'
  counter = 0
  result_count = 0
  while checkVal != 'DONE'
    done_res = done_http.request(done_req)
    xmldoc = Document.new(done_res.body)
    checkVal = XPath.first(xmldoc, "//s:key[@name='dispatchState']").text
    progress = XPath.first(xmldoc, "//s:key[@name='doneProgress']").text
    result_count = XPath.first(xmldoc, "//s:key[@name='eventCount']").text
    puts "Generating Data... #{progress}/1.0" if counter % 5 == 0
    counter += 1
    sleep(1)
  end
  puts 'Data generated'
  result_count.to_i
end
# Fetch one page of results for the given sid, as CSV, starting at offset
def splunk_search_results(sid, offset)
  uri = URI.parse(BASEURL)
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = true
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  req_uri = URI::HTTP.build(path: "/services/search/jobs/#{sid}/results", query: "output_mode=csv&count=#{COUNT}&offset=#{offset}")
  req = Net::HTTP::Get.new(req_uri)
  req.basic_auth(USER, PASS)
  res = http.request(req)
  res.body
end
# Read the saved search ID, poll until done, then page through the
# results and write them out as delimited text
def gather_data(file)
  target = open("#{SID_DIR}/#{APP}.client", "r")
  sid = target.read
  target.close
  puts "SID: #{sid}"

  request_num = splunk_search_state(sid) / COUNT   # number of full pages

  target = open(file, "w")
  for i in 0..request_num
    results = splunk_search_results(sid, i * COUNT)
    csv = CSV.new(results)
    csv.shift # drop the header row
    for row in csv
      # append minor/patch numbers to the browser version, when known
      if row[BROWSER_MINOR] != 'unknown' && row[BROWSER_MINOR] != nil
        row[BROWSER_VERSION] = row[BROWSER_VERSION] + '.' + row[BROWSER_MINOR]
        if row[BROWSER_PATCH] != 'unknown' && row[BROWSER_PATCH] != nil
          row[BROWSER_VERSION] = row[BROWSER_VERSION] + '.' + row[BROWSER_PATCH]
        end
      end

      # likewise for the operating system version
      if row[OS_MINOR] != 'unknown' && row[OS_MINOR] != nil
        row[OS_VERSION] = row[OS_VERSION] + '.' + row[OS_MINOR]
        if row[OS_PATCH] != 'unknown' && row[OS_PATCH] != nil
          row[OS_VERSION] = row[OS_VERSION] + '.' + row[OS_PATCH]
        end
      end

      row[6] = 1        # hit count overwrites the (now merged) ua_minor column
      row.pop           # drop the three merged trailing columns
      row.pop
      row.pop
      target.write "#{row.join(DELIM)}\n"
    end
  end
  puts "Done!"
ensure
  target.close
end

gather_data(OUTPUT)
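To make the column shuffling above concrete, here is the transform applied to one hypothetical row (column order per CLIENT_QUERY's table clause; the values are invented):

# mytime, user, ua_family, ua_major, ua_os_family, ua_os_major,
# ua_minor, ua_patch, ua_os_minor, ua_os_patch
row = ["2016-12-01T08:15:00Z", "alice", "Chrome", "61",
       "Windows", "10", "0", "3163", "unknown", "unknown"]

row[BROWSER_VERSION] += '.' + row[BROWSER_MINOR]  # "61"   -> "61.0"
row[BROWSER_VERSION] += '.' + row[BROWSER_PATCH]  # "61.0" -> "61.0.3163"
# OS minor/patch are "unknown", so row[OS_VERSION] stays "10"
row[6] = 1                                        # hit count overwrites ua_minor
3.times { row.pop }                               # drop the merged trailing columns
row.join(DELIM)
# => "2016-12-01T08:15:00Z|alice|Chrome|61.0.3163|Windows|10|1"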
#!/bin/env ruby
# Gathers per-location usage data from the saved Splunk "location" search.
require 'net/http'
require 'uri'
require 'csv'
require 'openssl'
require 'rexml/document'
include REXML

#
# Set options here
#
OUTPUT = ENV["LOCATION_DATA_FILE"] || "data.location.txt"
SID_DIR = ENV["SID_DIR"] || "sid"
APP = ENV["APP"] || "ondemand"
HOST = ENV["HOST"] || "ondemand.osc.edu"
START_DATE = ENV["START_DATE"] || "2016-11-29"
END_DATE = ENV["END_DATE"] || "today"
USER = ENV["USER"] || "admin"
PASS = ENV["PASS"] || "pass"

COUNT = 60000
BASEURL = 'https://splunk.osc.edu:8089'
DELIM = "|"

#
@@ -42,48 +34,81 @@
STDERR.puts "HOST = #{HOST}"
STDERR.puts "START_DATE = #{START_DATE}"
STDERR.puts "END_DATE = #{END_DATE}"
class InvalidToken < StandardError; end

# Poll the search job until it finishes; returns the number of matched events
def splunk_search_state(sid)
  done_uri = URI.parse(BASEURL + '/services/search/jobs/' + sid)
  done_http = Net::HTTP.new(done_uri.host, done_uri.port)
  done_http.use_ssl = true
  done_http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  done_req = Net::HTTP::Get.new(done_uri.request_uri)
  done_req.basic_auth(USER, PASS)

  checkVal = 'parsing'
  counter = 0
  result_count = 0
  while checkVal != 'DONE'
    done_res = done_http.request(done_req)
    xmldoc = Document.new(done_res.body)
    checkVal = XPath.first(xmldoc, "//s:key[@name='dispatchState']").text
    progress = XPath.first(xmldoc, "//s:key[@name='doneProgress']").text
    result_count = XPath.first(xmldoc, "//s:key[@name='eventCount']").text
    puts "Generating Data... #{progress}/1.0" if counter % 5 == 0
    counter += 1
    sleep(1)
  end
  puts 'Data generated'
  result_count.to_i
end
# Fetch one page of results for the given sid, as CSV, starting at offset
def splunk_search_results(sid, offset)
  uri = URI.parse(BASEURL)
  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = true
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  req_uri = URI::HTTP.build(path: "/services/search/jobs/#{sid}/results", query: "output_mode=csv&count=#{COUNT}&offset=#{offset}")
  req = Net::HTTP::Get.new(req_uri)
  req.basic_auth(USER, PASS)
  res = http.request(req)
  res.body
end
# Read the saved search ID, poll until done, then page through the
# results and write them out as delimited text
def gather_data(file)
  target = open("#{SID_DIR}/#{APP}.location", "r")
  sid = target.read
  target.close
  puts "SID: #{sid}"

  request_num = splunk_search_state(sid) / COUNT   # number of full pages

  target = open(file, "w")
  for i in 0..request_num
    results = splunk_search_results(sid, i * COUNT)
    csv = CSV.new(results)
    csv.shift # drop the header row
    for row in csv
      row << 1  # hit count (one deduped session per row)
      target.write "#{row.join(DELIM)}\n"
    end
  end
  puts "Done!"
ensure
  target.close
end

gather_data(OUTPUT)
#!/bin/env ruby
# Kicks off the three Splunk searches (app, client, location) for a host
# and saves each search ID (sid) under SID_DIR.
require 'net/http'
require 'uri'
require 'openssl'
require 'rexml/document'
include REXML
HOST = ENV["HOST"] || "ondemand.osc.edu"
APP = ENV["APP"] || "ondemand"
USER = ENV["USER"] || "admin"
PASS = ENV["PASS"] || "pass"
SID_DIR = ENV["SID_DIR"] || "sid"
BASEURL = 'https://splunk.osc.edu:8089'
APP_QUERY = "search source=/var/log/httpd24/#{HOST}_access_ssl.log earliest=-1mon@mon latest=@mon | where user!=\"-\" | where http_referer!=\"-\" | search *#{HOST}/pun/ | eval session=strftime(_time, \"\%Y-\%m-\%d\")+user+http_referer| dedup session | eval mytime=strftime(_time, \"\%Y-\%m-\%dT\%H:\%M:\%SZ\")| table mytime, user, http_referer"
CLIENT_QUERY = "search source=/var/log/httpd24/#{HOST}_access_ssl.log earliest=-1mon@mon latest=@mon | where user!=\"-\" | lookup user_agents http_user_agent | eval session=strftime(_time, \"\%Y-\%m-\%d\")+user+ua_family | dedup session | eval mytime=strftime(_time, \"\%Y-\%m-\%dT\%H:\%M:\%SZ\") | table mytime, user, ua_family, ua_major, ua_os_family, ua_os_major, ua_minor, ua_patch, ua_os_minor, ua_os_patch"
LOCATION_QUERY = "search source=/var/log/httpd24/#{HOST}_access_ssl.log earliest=-1mon@mon latest=@mon | where user!=\"-\" | iplocation src_ip | eval session=strftime(_time, \"\%Y-\%m-\%d\")+user+City| dedup session |eval mytime=strftime(_time, \"\%Y-\%m-\%dT\%H:\%M:\%SZ\") | table mytime, user, City, Region, Country"
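All three queries count daily sessions rather than raw hits: each builds a session key from the date plus the user plus one distinguishing field (http_referer, ua_family, or City) and dedups on it. A rough Ruby analogue of that dedup, with invented events:

require 'set'
require 'time'

# Hypothetical events: [timestamp, user, http_referer]
events = [
  ["2016-12-01T08:00:00Z", "alice", "https://ondemand.osc.edu/pun/sys/dashboard"],
  ["2016-12-01T09:30:00Z", "alice", "https://ondemand.osc.edu/pun/sys/dashboard"], # same session key
  ["2016-12-02T08:00:00Z", "alice", "https://ondemand.osc.edu/pun/sys/dashboard"], # new day, new key
]

seen = Set.new
sessions = events.select do |time, user, referer|
  key = Time.parse(time).strftime("%Y-%m-%d") + user + referer
  seen.add?(key)  # nil (falsy) if the key was already present
end
sessions.size  # => 2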
# Send the search request and save the returned search ID
def splunk_search(search_params, name)
  # set up the job-creation endpoint
  search_uri = URI.parse(BASEURL + '/services/search/jobs')
  search_http = Net::HTTP.new(search_uri.host, search_uri.port)
  search_http.use_ssl = true
  search_http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  # build the POST request with the query as form data
  search_req = Net::HTTP::Post.new(search_uri.request_uri)
  search_req.basic_auth(USER, PASS)
  search_req.set_form_data({'search' => search_params})
  # send the request to Splunk
  search_res = search_http.request(search_req)
  # parse the XML response and extract the search ID
  xmldoc = Document.new(search_res.body)
  sid = XPath.first(xmldoc, '//sid').text
  puts "#{HOST} #{name} search ID: " + sid
  target = open("#{SID_DIR}/#{APP}.#{name}", "w")
  target.write sid
  target.close
end

splunk_search(APP_QUERY, "app")
splunk_search(CLIENT_QUERY, "client")
splunk_search(LOCATION_QUERY, "location")
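The //sid XPath above pulls the job ID out of the small XML document Splunk returns on job creation; illustratively (abbreviated, not a verbatim payload):

body = '<response><sid>1480608000.12345</sid></response>'  # illustrative only
XPath.first(Document.new(body), '//sid').text              # => "1480608000.12345"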
@@ -3,6 +3,19 @@
DIR=$(dirname $(readlink -f "$0"))
OUTPUT_DIR=$DIR/output

# prompt for the Splunk credentials used by the search scripts
read -p 'User: ' user
read -sp 'Pass: ' pass && echo
export USER=$user
export PASS=$pass

export SID_DIR=$DIR/sid
mkdir -p $SID_DIR

function start_search(){
    export HOST=$1
    export APP=$2
    $DIR/bin/start_searches
}

function generate_monthly_report(){
    export HOST=$1
    export APP=$2
@@ -31,8 +44,18 @@
function cleanup(){
    rm -rf $OUTPUT_DIR
}

function cleanup2(){
    rm -rf $SID_DIR
}

cleanup

# kick off all three searches for each host up front...
start_search 'apps.awesim.org' 'awesim'
start_search 'apps.totalsim.us' 'totalsim'
start_search 'ondemand.osc.edu' 'ondemand'
start_search 'stat.osc.edu' 'stats'

# ...then gather the results into the monthly reports
generate_monthly_report 'apps.awesim.org' 'awesim'
generate_monthly_report 'apps.totalsim.us' 'totalsim'
generate_monthly_report 'ondemand.osc.edu' 'ondemand'
generate_monthly_report 'stat.osc.edu' 'stats'

cleanup2