用Python分析Apache等Web日志
用Python分析Apache等Web日志
1 分析日志的python框架awk.py
#
# Custom awk.py module
#
class controller:
    """Drive awk-style handlers over a line-oriented input file.

    Every subscribed handler must provide ``begin()``,
    ``process_line(line)``, ``end()``, ``description()`` and ``result()``.
    """

    def __init__(self, f):
        """Store the input file object *f* and start with no handlers."""
        self.m_file = f
        self.m_handlers = []

    def subscribe(self, o):
        """Register handler *o*; handlers are called in subscription order."""
        self.m_handlers.append(o)

    def run(self):
        """Call begin() on all handlers, feed them every line, then end()."""
        for o in self.m_handlers:
            o.begin()
        # readline() yields "" only at end of file, so use it as the
        # sentinel; blank lines arrive as "\n" and are still processed.
        for line in iter(self.m_file.readline, ""):
            for o in self.m_handlers:
                o.process_line(line)
        for o in self.m_handlers:
            o.end()

    def print_results(self):
        """Print each handler's description and result to standard output."""
        separator = "------------------------------------------------------"
        # Single-argument print(...) behaves the same as a Python 2 print
        # statement and as the Python 3 print function.
        print("Results:")
        for o in self.m_handlers:
            print(separator)
            print(o.description())
            print(separator)
            print(o.result())
- # Standard sys module
- import sys
- # Custom awk.py module
- import awk
class count_lines:
    """Handler that counts how many lines were fed to it."""

    def __init__(self):
        # Define the counter up front so result() is valid even if it is
        # called before begin() has run.
        self.m_count = 0

    def begin(self):
        """Reset the counter at the start of a run."""
        self.m_count = 0

    def process_line(self, s):
        """Count one line; the content of *s* is not inspected."""
        self.m_count += 1

    def end(self):
        """Nothing to finalize."""
        pass

    def description(self):
        """Return the heading printed above the result."""
        return "# of lines in the file"

    def result(self):
        """Return the number of lines seen since the last begin()."""
        return self.m_count
# Build the pipeline: an awk controller reading standard input, with the
# line-counting handler attached.
ac = awk.controller(sys.stdin)
ac.subscribe(count_lines())

# Stream the whole input through the handler, then report what it found.
ac.run()
ac.print_results()
使用方法：在 shell 中执行以下命令
# cat apachelog.log|python count_lines.py
统计浏览次数超过n次的访问者 visitors.py
How many people have returned to the site more than N times?
- import re;
- import sys
- import awk
class return_visitors:
    """Count the IP addresses that appear on more than ``n`` distinct days."""

    def __init__(self, n):
        """Remember the threshold *n* and start with no recorded visits."""
        self.m_n = n
        # Maps IP address -> list of distinct day strings seen for that IP.
        self.m_ip_days = {}
        # Defined up front so result() is valid before end() has run.
        self.m_count = 0

    def begin(self):
        """Nothing to prepare."""
        pass

    def process_line(self, s):
        """Record the day on which the line's first field (the IP) was seen.

        Lines with fewer than four whitespace-separated fields are skipped.
        """
        fields = s.split()
        try:
            ip = fields[0]
            # NOTE(review): presumably field 4 looks like
            # "[10/Oct/2000:13:55:36"; characters 1..6 then give "10/Oct",
            # i.e. day and month without the year - confirm that is intended.
            day = fields[3][1:7]
        except IndexError:
            return
        days = self.m_ip_days.setdefault(ip, [])
        if day not in days:
            days.append(day)

    def end(self):
        """Count the IPs whose number of distinct days exceeds the threshold."""
        self.m_count = sum(
            1 for days in self.m_ip_days.values() if len(days) > self.m_n
        )

    def description(self):
        """Return the heading printed above the result."""
        return "# of IP addresses that visited more than %s days" % self.m_n

    def result(self):
        """Return the count computed by end() (0 before end() is called)."""
        return self.m_count
# Report how many IPs in the log read from standard input came back on
# more than 2 distinct days.
ac = awk.controller(sys.stdin)
ac.subscribe(return_visitors(2))

ac.run()
ac.print_results()
# cat apachelog.log|python visitors.py
按照域名统计访问量 domain.py
- import re;
- import sys
- import awk
class referring_domains:
    """Tally how often each referring domain appears in the log lines."""

    # Matches "//<host>.<2-3 letter suffix>/" and captures the host name
    # between the slashes. Raw string with A-Z (not A-z) so that
    # punctuation such as "_" or "[" is not accepted in the suffix.
    _DOMAIN_RE = re.compile(r'//([a-zA-Z0-9\-.]*\.[a-zA-Z]{2,3})/')

    def __init__(self):
        """Start with an empty domain -> hit count table."""
        self.m_domains = {}

    def begin(self):
        """Nothing to prepare."""
        pass

    def process_line(self, line):
        """Extract the domain from the 11th field and bump its counter.

        Lines with fewer than eleven whitespace-separated fields, or whose
        11th field contains no recognizable domain, are skipped.
        """
        fields = line.split()
        if len(fields) <= 10:
            return
        m = self._DOMAIN_RE.search(fields[10])
        if m is None:
            return
        domain = m.group(1)
        self.m_domains[domain] = self.m_domains.get(domain, 0) + 1

    def end(self):
        """Nothing to finalize."""
        pass

    def description(self):
        """Return the heading printed above the result."""
        return "Referring domains"

    def sort(self, key1, key2):
        """Old-style comparison: order domains by descending hit count.

        Returns -1, 0 or 1. Kept for existing callers; result() no longer
        needs it because cmp-style sorting is unavailable on Python 3.
        """
        if self.m_domains[key1] > self.m_domains[key2]:
            return -1
        elif self.m_domains[key1] == self.m_domains[key2]:
            return 0
        else:
            return 1

    def result(self):
        """Return "domain count" lines, most hits first, plus two blank lines."""
        # reverse=True keeps the sort stable, so ties stay in dict order
        # just as they did with the comparison function returning 0.
        ordered = sorted(self.m_domains, key=self.m_domains.get, reverse=True)
        rows = ["%s %s\n" % (domain, self.m_domains[domain]) for domain in ordered]
        return "".join(rows) + "\n\n"
# Tally the referring domains found in the log read from standard input
# and print them, most frequent first.
ac = awk.controller(sys.stdin)
ac.subscribe(referring_domains())

ac.run()
ac.print_results()
评论暂时关闭