# frozen_string_literal: true

# Access-log burst detector.
#
# Usage: ruby <this script> ACCESS_LOG
#
# Reads a web server access log whose lines look like
#
#   66.249.72.211 - - [20/Feb/2012:00:00:00 +0100] "GET /en/gcc/compare/ean13/7680560770211/currency/EUR HTTP/1.1" 200 14826 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
#
# groups the requests by client IP, prints every request that pushes an IP
# above DEFAULT_REQUEST_LIMIT requests inside a DEFAULT_TIME_WINDOW second
# sliding window (prefixed with "C" when the user agent looks like a crawler),
# and finishes with a one-line summary.

# Substrings that mark a request as coming from a crawler.
CRAWLER_PATTERN = /archiver|slurp|bot|crawler|jeeves|spider|\.{6}/i

# Captures the leading client address, the bracketed timestamp and the sign,
# hours and minutes of its numeric UTC offset.
LOG_LINE_PATTERN = %r{\A(\S+)\s.*?\[(\d{1,2}/[A-Za-z]{3}/\d{4}:\d{2}:\d{2}:\d{2})\s([+-])(\d{2})(\d{2})\]}

# Width of the sliding window, in seconds.
DEFAULT_TIME_WINDOW = 2

# Number of requests inside the window that is still considered normal.
DEFAULT_REQUEST_LIMIT = 10

# One parsed request. +time+ is the timestamp text without the UTC offset,
# +crawler_match+ is the index at which CRAWLER_PATTERN matched the raw line
# (nil when it did not) and +epoch+ is the request time in seconds since the
# Unix epoch.
AccessEntry = Struct.new(:time, :agent, :crawler_match, :epoch)

# Converts a log timestamp to seconds since the Unix epoch.
#
# @param stamp [String] e.g. "20/Feb/2012:00:00:00"
# @param sign [String] "+" or "-" of the UTC offset
# @param offset_hours [String] two-digit hour part of the UTC offset
# @param offset_minutes [String] two-digit minute part of the UTC offset
# @return [Integer, nil] epoch seconds, or nil when the fields do not form a
#   valid date (for example an unknown month abbreviation)
def timestamp_to_epoch(stamp, sign, offset_hours, offset_minutes)
  day, month, year, hour, minute, second = stamp.split(%r{[/:]})
  offset = (offset_hours.to_i * 3600) + (offset_minutes.to_i * 60)
  offset = -offset if sign == '-'
  # Time.utc accepts three-letter English month names; the local offset is
  # subtracted afterwards so that entries with different offsets compare
  # correctly.
  Time.utc(year.to_i, month, day.to_i, hour.to_i, minute.to_i, second.to_i).to_i - offset
rescue ArgumentError
  nil
end

# Parses one access-log line.
#
# @param line [String] raw log line, with or without a trailing newline
# @return [Array(String, AccessEntry), nil] the client address and the parsed
#   entry, or nil when the line is not in the expected format
def parse_log_line(line)
  match = LOG_LINE_PATTERN.match(line)
  return nil unless match

  ip, stamp, sign, offset_hours, offset_minutes = match.captures
  epoch = timestamp_to_epoch(stamp, sign, offset_hours, offset_minutes)
  return nil unless epoch

  # The user agent is the last double-quoted field of the line.
  agent = line[/"([^"]*)"[^"]*\z/, 1]
  [ip, AccessEntry.new(stamp, agent, line =~ CRAWLER_PATTERN, epoch)]
end

# Selects the requests of a single client that exceed the rate limit.
#
# @param entries [Array<AccessEntry>] requests of one IP, in log order
# @param time_window [Integer] window width in seconds
# @param limit [Integer] requests allowed inside the window
# @return [Array<AccessEntry>] every entry that made the window hold more
#   than +limit+ requests
def burst_accesses(entries, time_window: DEFAULT_TIME_WINDOW, limit: DEFAULT_REQUEST_LIMIT)
  window = []
  entries.select do |entry|
    # Evict everything that is at least +time_window+ seconds away from the
    # current request before counting it.
    window.reject! { |epoch| (epoch - entry.epoch).abs >= time_window }
    window << entry.epoch
    window.length > limit
  end
end

# Formats one over-limit request as an output line.
#
# @param ip [String] client address
# @param entry [AccessEntry] the offending request
# @return [String] "C <ip>\t[time, agent, crawler_match]\n"; the leading "C"
#   is omitted (leaving the space) when the agent is not a crawler
def format_burst(ip, entry)
  flag = CRAWLER_PATTERN.match?(entry.agent.to_s) ? 'C' : ''
  "#{flag} #{ip}\t#{[entry.time, entry.agent, entry.crawler_match]}\n"
end

# Analyses log lines and builds the report.
#
# Lines that cannot be parsed are counted in the total, skipped, and reported
# on standard error.
#
# @param lines [#each] yields raw log lines (an Array, File.foreach, ...)
# @param time_window [Integer] window width in seconds
# @param limit [Integer] requests allowed inside the window
# @return [Array<String>] one line per over-limit request followed by the
#   summary line
def analyze_log(lines, time_window: DEFAULT_TIME_WINDOW, limit: DEFAULT_REQUEST_LIMIT)
  total_lines = 0
  crawler_lines = 0
  skipped_lines = 0
  log = Hash.new { |hash, ip| hash[ip] = [] }

  lines.each do |raw_line|
    total_lines += 1
    # scrub replaces invalid byte sequences that would make the regexp raise.
    ip, entry = parse_log_line(raw_line.scrub)
    if entry.nil?
      skipped_lines += 1
      next
    end
    log[ip] << entry
    crawler_lines += 1 if entry.crawler_match
  end
  warn "skipped #{skipped_lines} unparseable line(s)" if skipped_lines.positive?

  # Each IP gets its own window so one client's traffic never counts against
  # another's.
  report = log.flat_map do |ip, entries|
    burst_accesses(entries, time_window: time_window, limit: limit).map { |entry| format_burst(ip, entry) }
  end

  percent = total_lines.zero? ? 0.0 : crawler_lines.to_f / total_lines * 100
  report << "total lines: #{total_lines}, crawler_pattern: #{crawler_lines} " \
            "(#{format('%.2f', percent)} %), access IPs: #{log.length}\n"
end

if ARGV.empty?
  warn "usage: ruby #{$PROGRAM_NAME} ACCESS_LOG"
else
  # File.foreach streams the file line by line, so it is read only once and
  # never held in memory as a whole.
  print analyze_log(File.foreach(ARGV[0])).join
end