TOC
### CHAPTER 4 Iterators
### 4.1 introduction
# @lines = open('filename'); # alternate universe interface
lines = open("filename")  # a less alternate universe interface
# open(FILEHANDLE, 'filename');
# while (<FILEHANDLE>) {
#     last if /Plutonium/;
# }
# close FILEHANDLE;
# # do something with $_;
fh = open("filename")
try:
    # The file object is read one line at a time, so the loop can stop early.
    for line in fh:
        if "Plutonium" in line:
            break
finally:
    fh.close()
# do something with line
# # alternate universe interface
# @lines = open('filename');
# for (@lines) {
#     last if /Plutonium/;
# }
# # do something with $_;
# readlines() loads the whole file before the loop starts; the handle is
# closed as soon as the list has been built.
fh = open("filename")
try:
    lines = fh.readlines()
finally:
    fh.close()
for line in lines:
    if "Plutonium" in line:
        break
# do something with line
# @lines = open("yes |"); # alternate universe interface
# NOTE: `yes` never stops writing, so readlines() never returns. That is the
# point of the example: an eager list interface cannot represent this stream.
lines = os.popen("yes").readlines()  # alternate universes interface
# sub parse_section {
# my $fh = shift;
# my $title = parse_section_title($fh);
# my %variables = parse_variables($fh);
# return [$title, \%variables];
# }
def parse_section(fh):
    """Parse one section from the file handle `fh`.

    Calls parse_section_title(fh) and then parse_variables(fh), in that
    order, and returns the pair (title, variables).
    """
    section_title = parse_section_title(fh)
    section_variables = parse_variables(fh)
    return (section_title, section_variables)
# sub parse_section {
# my @lines = @_;
# my $title = parse_section_title(@lines);
# my %variables = parse_variables(@lines);
# return [$title, \%variables];
# }
# In the python case we *could* easily
# pop values from the list (but really you
# could in perl as well)
def parse_section(lines):
    """Parse one section from the list `lines`.

    Calls parse_section_title(lines) and then parse_variables(lines), in
    that order, and returns the pair (title, variables).
    """
    section_title = parse_section_title(lines)
    section_variables = parse_variables(lines)
    return (section_title, section_variables)
# opendir D, "/tmp";
# @entries = readdir D;
# In python we get a list instead of an iterator
entries = os.listdir("/tmp")
# opendir D, "/tmp";
# while (my $entry = readdir D) {
# # Do something with $entry
# }
# Python doesn't have "scalar" mode type behavior changes
for entry in os.listdir("/tmp"):
    pass  # Do something with entry
# while (my $file = glob("/tmp/*.[ch]")) {
# # Do something with $file
# }
# glob in python is also not an iterator
for _file in glob.glob("/tmp/*.[ch]"):
    pass  # Do something with _file
# while (my $key = each %hash) {
# # Do something with $key
# }
# Assuming `hash` is a dict: iterating it directly walks its keys, so no
# version-specific iterkeys() call is needed.
for key in hash:
    pass  # Do something with key
# @matches = ("12:34:56" =~ m/(\d+)/g);
matches = re.findall(r"(\d+)", "12:34:56")
# while ("12:34:56" =~ m/(\d+)/g) {
# # do something with $1
# }
for m in re.finditer(r"(\d+)", "12:34:56"):
    pass  # do something with m (where m is a "match" object)
### 4.2 Homemade Iterators
# sub dir_walk {
# my ($dir, $filefunc, $dirfunc, $user) = @_;
# my $iterator = make_iterator($dir);
# while (my $filename = NEXTVAL($iterator)) {
# if (-f $filename) { $filefunc->($filename, $user) }
# else { $dirfunc->($filename, $user) }
# }
# }
# In python os.walk returns an iterator as is
def dir_walk(dir, filefunc, dirfunc, user):
    """Walk `dir` and dispatch every name produced by make_iterator(dir).

    Regular files go to filefunc(filename, user); everything else goes to
    dirfunc(filename, user).
    """
    for filename in make_iterator(dir):
        handler = filefunc if os.path.isfile(filename) else dirfunc
        handler(filename, user)
# sub upto {
# my ($m, $n) = @_;
# return sub {
# return $m <= $n ? $m++ : undef;
# };
# }
# my $it = upto(3, 5);
def upto(m, n):
    """Return a zero-argument function that counts from m up to n.

    Each call returns the next value; once the counter has passed n every
    further call returns None.
    """
    # A mutable cell stands in for a closed-over variable, because an inner
    # function cannot rebind an outer local without `nonlocal`.
    state = {"next": m}

    def step():
        current = state["next"]
        state["next"] = current + 1
        if current > n:
            return None
        return current

    return step
it = upto(3, 5)
## of course in python it's more natural to do the following:
# def upto(m,n):
# for x in range(m,n+1):
# yield x
# my $nextval = $it->();
nextval = it()
# while (defined(my $val = $it->())) {
# # now do something with $val, such as:
# print "$val\n";
# }
# this doesn't translate in a pretty way to python
# since we can't have statements in a while
# context
val = it()
while val is not None:
    # now do something with val, such as:
    print(val)
    val = it()
# but of course we'd just use the two-argument form of iter(), which keeps
# calling it() until it returns the sentinel None:
for val in iter(it, None):
    print(val)
# for my $val (1 .. 10000000) {
# # now do something with $val
# }
# xrange yields the numbers lazily instead of building a ten-million-element
# list first; the upper bound is exclusive, hence 10000001 to match the
# inclusive Perl range 1 .. 10000000.
for val in xrange(1, 10000001):
    pass  # now do something with val
# package Iterator_Utils;
# use base Exporter;
# @EXPORT_OK = qw(NEXTVAL Iterator
# append imap igrep
# iterate_function filehandle_iterator list_iterator);
# %EXPORT_TAGS = ('all' => \@EXPORT_OK);
# sub NEXTVAL { $_[0]->() }
# my $nextval = NEXTVAL($it);
# while (defined(my $val = NEXTVAL($it))) {
# # now do something with $val
# }
# No need to do these machinations since this is already built into
# python except we'd do this with "for"
for val in it:
    pass  # now do something with val
# sub upto {
# my ($m, $n) = @_;
# return Iterator {
# return $m <= $n ? $m++ : undef;
# };
# }
# sub Iterator (&) { return $_[0] }
# in python we just do this with a yield
def upto(m, n):
    """Generate m, m+1, ... up to and including n (nothing if m > n)."""
    value = m
    while True:
        if not value <= n:
            return
        yield value
        value = value + 1
# # iterator version
# sub dir_walk {
# my @queue = shift;
# return Iterator {
# while (@queue) {
# my $file = shift @queue;
# if (-d $file) {
# opendir my $dh, $file or next;
# my @newfiles = grep {$_ ne "." && $_ ne ".."} readdir $dh;
# push @queue, map "$file/$_", @newfiles;
# }
# return $file;
# } else {
# return;
# }
# };
# }
def dir_walk(root):
    """Yield `root` and every path beneath it, breadth-first.

    Directories are expanded with os.listdir and their children appended to
    a FIFO queue. A directory that cannot be listed is skipped without being
    yielded, mirroring the Perl original's `opendir ... or next`.
    """
    queue = [root]
    while queue:
        _file = queue.pop(0)
        if os.path.isdir(_file):
            try:
                children = os.listdir(_file)
            except OSError:
                # Unreadable or vanished directory: drop it and carry on.
                continue
            queue.extend(os.path.join(_file, child) for child in children)
        yield _file
# sub dir_walk {
# my ($top, $code) = @_;
# my $DIR;
# $code->($top);
# if (-d $top) {
# my $file;
# unless (opendir $DIR, $top) {
# warn "Couldn’t open directory $top: $!; skipping.\n";
# return;
# }
# while ($file = readdir $DIR) {
# next if $file eq '.'|| $file eq '..'
# dir_walk("$top/$file", $code);
# }
# }
# }
def dir_walk(top, code):
    """Call code(path) for `top` and, recursively, for everything below it.

    Only the directory listing is guarded: if os.listdir fails, a message is
    printed and that directory is skipped. Exceptions raised by `code`
    propagate to the caller instead of being reported as an unreadable
    directory.
    """
    code(top)
    if not os.path.isdir(top):
        return
    try:
        entries = os.listdir(top)
    except OSError as why:
        print("Couldn't open directory %s: %s" % (top, why))
        return
    for _file in entries:
        dir_walk(os.path.join(top, _file), code)
### 4.3 Examples
# sub interesting_files {
# my $is_interesting = shift;
# my @queue = @_;
# return Iterator {
# while (@queue) {
# my $file = shift @queue;
# if (-d $file) {
# opendir my $dh, $file or next;
# my @newfiles = grep {$_ ne "." && $_ ne ".."} readdir $dh;
# push @queue, map "$file/$_", @newfiles;
# }
# return $file if $is_interesting->($file);
# }
# return;
# };
# }
def interesting_files(is_interesting, *top_dirs):
    """Yield every path under `top_dirs` for which is_interesting(path) is true.

    The traversal is breadth-first over a FIFO queue seeded with `top_dirs`.
    A directory that cannot be listed is skipped without being tested,
    mirroring the Perl original's `opendir ... or next`.
    """
    queue = list(top_dirs)
    while queue:
        _file = queue.pop(0)
        if os.path.isdir(_file):
            try:
                children = os.listdir(_file)
            except OSError:
                continue
            queue.extend(os.path.join(_file, child) for child in children)
        if is_interesting(_file):
            yield _file
# # Files are deemed to be interesting if they mention octopuses
# sub contains_octopuses {
# my $file = shift;
# return unless -T $file && open my($fh), "<", $file;
# while (<$fh>) {
# return 1 if /octopus/i;
# }
# return;
# }
# my $octopus_file =
# interesting_files(\&contains_octopuses, 'uploads', 'downloads');
# while ($file = NEXTVAL($octopus_file)) {
# # do something with the file
# }
# if (NEXTVAL($next_octopus)) {
# # yes, there is an interesting file
# } else {
# # no, there isn’t.
# }
# undef $next_octopus;
def contains_octopuses(_file):
    """Return True if the regular file `_file` mentions "octopus".

    The match is case-insensitive, like the Perl original's /octopus/i.
    Anything that is not a regular file, or that cannot be opened, counts as
    not interesting and yields False.
    """
    if not os.path.isfile(_file):
        return False
    try:
        fh = open(_file)
    except IOError:
        return False
    try:
        for line in fh:
            if "octopus" in line.lower():
                return True
        return False
    finally:
        # Close the handle even when returning from inside the loop.
        fh.close()
octopus_file = interesting_files(contains_octopuses, "uploads", "downloads")
for _octopus_file in octopus_file:
    pass  # do something with the file
# The "is there at least one?" test needs an iterator of its own, because
# the loop above has already drained octopus_file.
next_octopus = interesting_files(contains_octopuses, "uploads", "downloads")
try:
    next(next_octopus)
    # yes, there is an interesting file
except StopIteration:
    pass  # no there isn't
del next_octopus
# sub permute {
# my @items = @{ $_[0] };
# my @perms = @{ $_[1] };
# unless (@items) {
# print "@perms\n";
# } else {
# my(@newitems,@newperms,$i);
# foreach $i (0 .. $#items) {
# @newitems = @items;
# @newperms = @perms;
# unshift(@newperms, splice(@newitems, $i, 1));
# permute([@newitems], [@newperms]);
# }
# }
# }
# # sample call:
# permute([qw(red yellow blue green)], []);
# Direct port of the Perl: `perms` is the partial permutation built so far
# (each newly chosen item goes on the front, like unshift) and `items` holds
# what is still unused. A line is printed as soon as `items` runs out, so
# output starts immediately rather than after all permutations are known.
def permute(items, perms):
    """Print every permutation of `items`, one space-separated line each.

    items -- list of items not yet placed (not modified)
    perms -- list of items already placed; pass [] on the initial call
    """
    if not items:
        print(" ".join(str(perm) for perm in perms))
        return
    for i in range(len(items)):
        newitems = items[:]
        newitem = newitems.pop(i)
        permute(newitems, [newitem] + perms)
# sample call
permute(["red", "yellow", "blue", "green"], [])
# my $it = permute('A'..'D');
# while (my @p = NEXTVAL($it)) {
# print "@p\n";
# }
# Drive the iterator form of permute() and print each permutation.
it = permute(list("ABCD"))
for p in it:
    print(p)
# sub permute {
# my @items = @_;
# my @pattern = (0) x @items;
# return Iterator {
# return unless @pattern;
# my @result = pattern_to_permutation(\@pattern, \@items);
# @pattern = increment_pattern(@pattern);
# return @result;
# };
# }
def permute(items):
    """Generate permutations of `items` by stepping an index pattern.

    Starts from a pattern of zeros, one per item. Each round converts the
    pattern with pattern_to_permutation and advances it with
    increment_pattern; generation stops when the pattern becomes falsy.
    """
    pattern = len(items) * [0]
    while True:
        if not pattern:
            return
        current = pattern_to_permutation(pattern, items)
        # Advance before yielding, as the original does.
        pattern = increment_pattern(pattern)
        yield current
# sub pattern_to_permutation {
# my $pattern = shift;
# my @items = @{shift()};
# my @r;
# for (@$pattern) {
# push @r, splice(@items, $_, 1);
# }
# @r;
# }
def pattern_to_permutation(pattern, items):
    """Turn an index pattern into a permutation of `items`.

    Each pattern entry is an index into the items still remaining; the
    chosen item is removed from a private copy and appended to the result.
    The caller's `items` is left untouched.
    """
    remaining = items[:]
    return [remaining.pop(index) for index in pattern]
# sub increment_odometer {
# my @odometer = @_;
# my $wheel = $#odometer; # start at rightmost wheel
# until ($odometer[$wheel] < 9 || $wheel < 0) {
# $odometer[$wheel] = 0;
# $wheel--; # next wheel to the left
# }
# if ($wheel < 0) {
# return; # fell off the left end; no more sequences
# } else {
# $odometer[$wheel]++; # this wheel now turns one notch
# return @odometer;
# }
# }
def increment_odometer(odometer):
    """Return the next reading of a base-10 odometer, or None on rollover.

    odometer -- list of digits 0..9, most significant first

    Works on a copy, as the Perl original does with `my @odometer = @_`, so
    the caller's list is never modified. Returns None when every wheel was
    at 9, and also for an empty list.
    """
    odometer = list(odometer)
    wheel = len(odometer) - 1  # start at rightmost wheel
    # Test the bound first so index -1 is never read by accident.
    while wheel >= 0 and odometer[wheel] >= 9:
        odometer[wheel] = 0
        wheel -= 1  # next wheel to the left
    if wheel < 0:
        return None  # fell off the left end; no more sequences
    odometer[wheel] += 1  # this wheel now turns one notch
    return odometer
# sub increment_pattern {
# my @odometer = @_;
# my $wheel = $#odometer; # start at rightmost wheel
# until ($odometer[$wheel] < $#odometer-$wheel || $wheel < 0) {
# $odometer[$wheel] = 0;
# $wheel--; # next wheel to the left
# }
# if ($wheel < 0) {
# return; # fell off the left end; no more sequences
# } else {
# $odometer[$wheel]++; # this wheel now turns one notch
# return @odometer;
# }
# }
def increment_pattern(odometer):
    """Return the next permutation pattern, or None when exhausted.

    Same idea as an odometer, except the wheel at position `w` may only show
    0 .. len(odometer)-1-w, so the rightmost wheel is always 0.

    Works on a copy, so the caller's list is never modified. Returns None
    when no further pattern exists, and also for an empty list.
    """
    odometer = list(odometer)
    last = len(odometer) - 1
    wheel = last  # start at rightmost wheel
    # Test the bound first so index -1 is never read by accident.
    while wheel >= 0 and odometer[wheel] >= last - wheel:
        odometer[wheel] = 0
        wheel -= 1  # next wheel to the left
    if wheel < 0:
        return None  # fell off the left end; no more sequences
    odometer[wheel] += 1  # this wheel now turns one notch
    return odometer
# sub n_to_pat {
# my @odometer;
# my ($n, $length) = @_;
# for my $i (1 .. $length) {
# unshift @odometer, $n % $i;
# $n = int($n/$i);
# }
# return $n ? () : @odometer;
# }
def n_to_pat(n, length):
    """Convert the integer `n` into a permutation pattern of `length` wheels.

    Digits are peeled off in the mixed-radix system 1, 2, ..., length and
    prepended, so the rightmost wheel is always 0. Returns [] when `n` is
    too large to be represented in `length` wheels.
    """
    odometer = []
    for i in range(1, length + 1):
        odometer.insert(0, n % i)
        # Floor division keeps n an integer on every Python version.
        n //= i
    if n:
        return []
    return odometer
# sub permute {
# my @items = @_;
# my $n = 0;
# return Iterator {
# my @pattern = n_to_pat($n, scalar(@items));
# my @result = pattern_to_permutation(\@pattern, \@items);
# $n++;
# return @result;
# };
# }
def permute(items):
    """Generate permutations of `items` by counting n = 0, 1, 2, ...

    Each n is converted with n_to_pat and the result with
    pattern_to_permutation; generation ends at the first n for which
    n_to_pat returns an empty pattern.
    """
    n = 0
    while True:
        pattern = n_to_pat(n, len(items))
        if not pattern:
            return
        yield pattern_to_permutation(pattern, items)
        n += 1
# sub iterate_function {
# my $n = 0;
# my $f = shift;
# return Iterator {
# return $f->($n++);
# };
# }
def iterate_function(f):
    """Generate f(0), f(1), f(2), ... without end."""
    counter = -1
    while True:
        counter += 1
        yield f(counter)
# sub permute {
# my @items = @_;
# my $n = 0;
# return Iterator {
# $n++, return @items if $n==0;
# my $i;
# my $p = $n;
# for ($i=1; $i<=@items && $p%$i==0; $i++) {
# $p /= $i;
# }
# my $d = $p % $i;
# my $j = @items - $i;
# return if $j < 0;
# @items[$j+1..$#items] = reverse @items[$j+1..$#items];
# @items[$j,$j+$d] = @items[$j+$d,$j];
# $n++;
# return @items;
# };
# }
def permute(_items):
    """Generate every permutation of `_items`, each as a new list.

    The n-th permutation is derived from the previous one by reversing a
    tail of the list and performing one swap; the position and distance come
    from how many of 1, 2, 3, ... divide n. The input is copied and never
    modified.
    """
    items = list(_items)
    yield items
    n = 1
    while True:
        # Work on a fresh copy each round: every list already handed out
        # stays as it was, so list(permute(x)) holds distinct permutations
        # rather than many references to one mutated list.
        items = items[:]
        i = 1
        p = n
        while i <= len(items) + 1 and p % i == 0:
            p //= i  # floor division keeps p usable as an index offset
            i += 1
        d = p % i
        j = len(items) - i
        if j < 0:
            return
        items[j + 1:] = reversed(items[j + 1:])
        items[j], items[j + d] = items[j + d], items[j]
        n += 1
        yield items
# sub make_genes {
# my $pat = shift;
# my @tokens = split /[()]/, $pat;
# for (my $i = 1; $i < @tokens; $i += 2) {
# $tokens[$i] = [0, split(//, $tokens[$i])];
# }
# my $FINISHED = 0;
# return Iterator {
# return if $FINISHED;
# my $finished_incrementing = 0;
# my $result = "";
# for my $token (@tokens) {
# if (ref $token eq "") { # plain string
# $result .= $token;
# } else { # wildcard
# my ($n, @c) = @$token;
# $result .= $c[$n];
# unless ($finished_incrementing) {
# if ($n == $#c) { $token->[0] = 0 }
# else { $token->[0]++; $finished_incrementing = 1 }
# }
# }
# }
# $FINISHED = 1 unless $finished_incrementing;
# return $result;
# }
# }
def make_genes(pat):
    """Generate every string matched by a pattern such as "A(CG)T(AT)".

    Text outside parentheses is literal; each parenthesised group offers one
    character per position. The leftmost group varies fastest.
    """
    tokens = re.split("[()]", pat)
    # Odd-numbered tokens are the parenthesised groups. Each becomes
    # [current_index, char, char, ...].
    for i in range(1, len(tokens), 2):
        tokens[i] = [0] + list(tokens[i])
    finished = False
    while not finished:
        advanced = False
        result = ""
        for token in tokens:
            # Wildcards are the only list-valued tokens; testing for the
            # list also keeps literals working when `pat` is unicode.
            if not isinstance(token, list):
                result += token
                continue
            n = token[0]
            choices = token[1:]
            result += choices[n]
            if not advanced:
                if n == len(choices) - 1:
                    token[0] = 0  # wrap around and carry to the next group
                else:
                    token[0] += 1
                    advanced = True
        # Every group wrapped around: this result is the last one.
        finished = not advanced
        yield result
# %n_expand = qw(N ACGT
# B CGT D AGT H ACT V ACG
# K GT M AC R AG S CG W AT Y CT);
# sub make_dna_sequences {
# my $pat = shift;
# for my $abbrev (keys %n_expand) {
# $pat =~ s/$abbrev/($n_expand{$abbrev})/g;
# }
# return make_genes($pat);
# }
# Ambiguity codes and the bases each one stands for.
n_expand = {"N" : "ACGT",
            "B" : "CGT", "D" : "AGT", "H" : "ACT", "V" : "ACG",
            "K" : "GT", "M" : "AC", "R" : "AG", "S" : "CG", "W" : "AT", "Y" : "CT"}
def make_dna_sequences(pat):
    """Expand ambiguity codes in `pat` and return make_genes() over the result.

    Every code letter is replaced by its bases wrapped in parentheses, e.g.
    "N" becomes "(ACGT)". The parentheses are what make_genes() is handed as
    a group; without them the bases would be inserted as literal text.
    """
    # The expansions contain only A, C, G, T and parentheses, so an earlier
    # replacement can never be re-expanded by a later one.
    for abbrev in n_expand:
        pat = pat.replace(abbrev, "(" + n_expand[abbrev] + ")")
    return make_genes(pat)
# sub filehandle_iterator {
# my $fh = shift;
# return Iterator { <$fh> };
# }
# my $it = filehandle_iterator(*STDIN);
# while (defined(my $line = NEXTVAL($it))) {
# # do something with $line
# }
### python already does this by default
for line in open("foo"):
    pass  # do something with line
# LASTNAME:FIRSTNAME:CITY:STATE:OWES
# Adler:David:New York:NY:157.00
# Ashton:Elaine:Boston:MA:0.00
# Dominus:Mark:Philadelphia:PA:0.00
# Orwant:Jon:Cambridge:MA:26.30
# Schwern:Michael:New York:NY:149658.23
# Wall:Larry:Mountain View:CA:-372.14
# package FlatDB;
# my $FIELDSEP = qr/:/;
# sub new {
# my $class = shift;
# my $file = shift;
# open my $fh, "<", $file or return;
# chomp(my $schema = <$fh>);
# my @field = split $FIELDSEP, $schema;
# my %fieldnum = map { uc $field[$_] => $_ } (0..$#field);
# bless { FH => $fh, FIELDS => \@field, FIELDNUM => \%fieldnum,
# FIELDSEP => $FIELDSEP } => $class;
# }
class FlatDB(object):
    """Read-only view of a delimited text file whose first line names the fields."""

    # Separator between fields on every line, including the schema line.
    FIELDSEP = ":"

    def __init__(self, _file):
        """Open `_file` and read its schema line.

        Sets:
          self._file    -- the path as given
          self.fh       -- the open file handle, positioned after the schema line
          self.schema   -- the first line, stripped
          self.field    -- list of field names as written in the file
          self.fieldnum -- dict mapping upper-cased field name to column index

        Raises whatever open() raises if the file cannot be opened.
        """
        self._file = _file
        self.fh = open(self._file)
        self.schema = self.fh.readline().strip()
        self.field = self.schema.split(FlatDB.FIELDSEP)
        self.fieldnum = dict((name.upper(), index)
                             for index, name in enumerate(self.field))
# # usage: $dbh->query(fieldname, value)
# # returns all records for which (fieldname) matches (value)
# use Fcntl ':seek';
# sub query {
# my $self = shift;
# my ($field, $value) = @_;
# my $fieldnum = $self->{FIELDNUM}{uc $field};
# return unless defined $fieldnum;
# my $fh = $self->{FH};
# seek $fh, 0, SEEK_SET;
# <$fh>; # discard schema line
# return Iterator {
# local $_;
# while (<$fh>) {
# chomp;
# my @fields = split $self->{FIELDSEP}, $_, -1;
# my $fieldval = $fields[$fieldnum];
# return $_ if $fieldval eq $value;
# }
# return;
# };
# }
def query(self, field, value):
    """Generate every record whose `field` column equals `value`.

    field -- field name, matched case-insensitively against the schema
    value -- string the column must equal exactly

    Yields the stripped record line. An unknown field name yields nothing.
    The file is rewound when iteration starts, so two queries on the same
    object cannot be interleaved.
    """
    fieldnum = self.fieldnum.get(field.upper())
    if fieldnum is None:
        return
    fh = self.fh
    fh.seek(0)
    fh.readline()  # discard schema line
    for line in fh:
        # Strip before splitting, otherwise the last column still carries
        # its newline and can never equal `value`.
        record = line.strip()
        # FIELDSEP is looked up through the instance; it resolves to the
        # class attribute unless overridden.
        fields = record.split(self.FIELDSEP)
        if fieldnum < len(fields) and fields[fieldnum] == value:
            yield record
# use FlatDB;
# my $dbh = FlatDB->new('db.txt') or die $!;
# my $q = $dbh->query('STATE', 'NY');
# while (my $rec = NEXTVAL($q)) {
# print $rec;
# }
# Open the sample database and print every record whose STATE is NY.
dbh = FlatDB("db.txt")
q = dbh.query("STATE", "NY")
for rec in q:
    print(rec)
# my $q = $dbh->callbackquery(sub { my %F=@_; $F{OWES} > 10 });
# my $q = $dbh->callbackquery(sub { my %F=@_; $F{FIRSTNAME} =~ /ˆM/ });
# use Fcntl ':seek';
# sub callbackquery {
# my $self = shift;
# my $is_interesting = shift;
# my $fh = $self->{FH};
# seek $fh, 0, SEEK_SET;
# <$fh>; # discard header line
# return Iterator {
# local $_;
# while (<$fh>) {
# chomp;
# my %F;
# my @fieldnames = @{$self->{FIELDS}};
# my @fields = split $self->{FIELDSEP};
# for (0 .. $#fieldnames) {
# $F{$fieldnames[$_]} = $fields[$_];
# }
# return $_ if $is_interesting->(%F);
# }
# return;
# }
# }
# Field values arrive as strings; convert before comparing numerically
# (Perl does this implicitly, Python does not).
q = dbh.callbackquery(lambda F: float(F["OWES"]) > 10)
q = dbh.callbackquery(lambda F: F["FIRSTNAME"].startswith("M"))
def callbackquery(self, is_interesting):
    """Generate every record for which is_interesting(row) is true.

    `row` is a dict mapping the schema's field names (as stored in
    self.field) to the record's values. The stripped record line is yielded.
    The file is rewound when iteration starts.
    """
    handle = self.fh
    handle.seek(0)
    handle.readline()  # discard schema line
    for raw in handle:
        record = raw.strip()
        row = dict(zip(self.field, record.split(FlatDB.FIELDSEP)))
        if is_interesting(row):
            yield record
# use FlatDB;
# my $dbh = FlatDB->new('db.txt') or die $!;
# my $q1 = $dbh->query('STATE', 'MA');
# my $q2 = $dbh->query('STATE', 'NY');
# for (1..2) {
# print NEXTVAL($q1), NEXTVAL($q2);
# }
# Pull alternately from two queries on the same handle, twice each.
dbh = FlatDB("db.txt")
q1 = dbh.query("STATE", "MA")
q2 = dbh.query("STATE", "NY")
for x in (1, 2):
    print("%s %s" % (q1.next(), q2.next()))
# # usage: $dbh->query(fieldname, value)
# # returns all records for which (fieldname) matches (value)
# use Fcntl ':seek';
# sub query {
# my $self = shift;
# my ($field, $value) = @_;
# my $fieldnum = $self->{FIELDNUM}{uc $field};
# return unless defined $fieldnum;
# my $fh = $self->{FH};
# seek $fh, 0, SEEK_SET;
# <$fh>; # discard header line
# my $position = tell $fh;
# return Iterator {
# local $_;
# seek $fh, $position, SEEK_SET;
# while (<$fh>) {
# chomp;
# $position = tell $fh;
# my @fields = split $self->{FIELDSEP};
# my $fieldval = $fields[$fieldnum];
# return $_ if $fieldval eq $value;
# }
# return;
# };
# }
# # callbackquery with bug fix
# use Fcntl ':seek';
# sub callbackquery {
# my $self = shift;
# my $is_interesting = shift;
# my $fh = $self->{FH};
# seek $fh, 0, SEEK_SET;
# <$fh>; # discard header line
# my $position = tell $fh;
# return Iterator {
# local $_;
# seek $fh, $position, SEEK_SET;
# while (<$fh>) {
# $position = tell $fh;
# my %F;
# my @fieldnames = @{$self->{FIELDS}};
# my @fields = split $self->{FIELDSEP};
# for (0 .. $#fieldnames) {
# $F{$fieldnames[$_]} = $fields[$_];
# }
# return $_ if $is_interesting->(%F);
# }
# return;
# };
# }
# 1;
class FlatDB(object):
    """Read-only view of a delimited text file whose first line names the fields.

    Several queries may be active on one object at a time: each remembers
    its own file position and seeks back to it before reading.
    """

    # Separator between fields on every line, including the schema line.
    FIELDSEP = ":"

    def __init__(self, _file):
        """Open `_file` and read its schema line.

        Sets self._file, self.fh (the open handle), self.schema (first line,
        stripped), self.field (field names as written) and self.fieldnum
        (upper-cased field name -> column index).
        """
        self._file = _file
        self.fh = open(self._file)
        self.schema = self.fh.readline().strip()
        self.field = self.schema.split(FlatDB.FIELDSEP)
        self.fieldnum = dict((name.upper(), index)
                             for index, name in enumerate(self.field))

    def close(self):
        """Close the underlying file handle."""
        self.fh.close()

    def _records(self):
        """Generate each stripped data line, immune to other readers of self.fh."""
        fh = self.fh
        fh.seek(0)
        fh.readline()  # discard schema line
        position = fh.tell()
        while True:
            # Another iterator may have moved the handle since our last
            # read, so always return to our own position first.
            fh.seek(position)
            line = fh.readline()
            if not line:
                break
            position = fh.tell()
            yield line.strip()

    def query(self, field, value):
        """Generate every record whose `field` column equals `value`.

        `field` is matched case-insensitively against the schema; an unknown
        field yields nothing. Records too short to have that column are
        skipped.
        """
        fieldnum = self.fieldnum.get(field.upper())
        if fieldnum is None:
            return
        for record in self._records():
            # Split the stripped line, otherwise the last column keeps its
            # newline and can never equal `value`.
            fields = record.split(self.FIELDSEP)
            if fieldnum < len(fields) and fields[fieldnum] == value:
                yield record

    def callbackquery(self, is_interesting):
        """Generate every record for which is_interesting(row) is true.

        `row` is a dict mapping the schema's field names to the record's
        values.
        """
        for record in self._records():
            F = dict(zip(self.field, record.split(self.FIELDSEP)))
            if is_interesting(F):
                yield record
# package FlatDB::Iterator;
# my $FIELDSEP = qr/\s+/;
# sub new {
# my $class = shift;
# my $it = shift;
# my @field = @_;
# my %fieldnum = map { uc $field[$_] => $_ } (0..$#field);
# bless { FH => $it, FIELDS => \@field, FIELDNUM => \%fieldnum,
class IterFlatDB(object):
    """Flat database whose records come from an iterator instead of a file."""

    # Regular expression separating the fields of a record.
    FIELDSEP = r"\s+"

    def __init__(self, it, *field):
        """Wrap the record iterator `it` with the given field names.

        The names may be passed as separate arguments or as one list or
        tuple: IterFlatDB(it, "a", "b") and IterFlatDB(it, ["a", "b"]) are
        equivalent.

        Sets self.it, self.field (tuple of names as given) and self.fieldnum
        (upper-cased field name -> column index).
        """
        # Accept a single sequence of names as well as varargs.
        if len(field) == 1 and isinstance(field[0], (list, tuple)):
            field = tuple(field[0])
        self.it = it
        self.field = field
        self.fieldnum = dict((name.upper(), index)
                             for index, name in enumerate(self.field))
# FlatDB::Iterator->new(
# $iterator,
# qw(address rfc931 username datetime tz method page protocol
# status bytes referrer agent)
# );
# The field names are separate positional arguments, so unpack the list.
IterFlatDB(iterator,
           *"address rfc931 username datetime tz method page protocol status bytes referrer agent".split())
# # usage: $dbh->query(fieldname, value)
# # returns all records for which (fieldname) matches (value)
# sub query {
# my $self = shift;
# my ($field, $value) = @_;
# my $fieldnum = $self->{FIELDNUM}{uc $field};
# return unless defined $fieldnum;
# my $it = $self->{FH};
# # seek $fh, 0, SEEK_SET;
# # <$fh>; # discard header line
# return Iterator {
# local $_;
# while (defined ($_ = NEXTVAL($it))) {
# my @fields = split $self->{FIELDSEP};
# my $fieldval = $fields[$fieldnum];
# return $_ if $fieldval eq $value;
# }
# return;
# };
# }
def query(self, field, value):
    """Generate every record from self.it whose `field` column equals `value`.

    field -- field name, matched case-insensitively
    value -- string the column must equal exactly

    An unknown field yields nothing. Records with too few columns are
    skipped rather than raising IndexError. Consumes self.it.
    """
    fieldnum = self.fieldnum.get(field.upper())
    if fieldnum is None:
        return
    for record in self.it:
        # FIELDSEP is looked up through the instance; it resolves to the
        # class attribute unless overridden.
        fields = re.split(self.FIELDSEP, record)
        if fieldnum < len(fields) and fields[fieldnum] == value:
            yield record
# my $qit =
# FlatDB::Iterator->new($it, @FIELDNAMES)->query($field, $value);
# Field names are separate positional arguments, hence the unpacking.
qit = IterFlatDB(it, *FIELDNAMES).query(field, value)
# sub readbackwards {
# my $file = shift;
# open my($fh), "|-", "tac", $file
# or return;
# return Iterator { return scalar(<$fh>) };
# }
def readbackwards(_file):
    """Return a file object that yields the lines of `_file`, last line first.

    Runs the external `tac` program. The file name is passed as a separate
    argument, never through a shell, so spaces and shell metacharacters in
    it are harmless. Raises OSError if `tac` cannot be started.
    """
    import subprocess
    proc = subprocess.Popen(["tac", "--", _file],
                            stdout=subprocess.PIPE,
                            universal_newlines=True)  # text lines, as os.popen gave
    return proc.stdout
# my @fields = qw(address rfc931 username datetime tz method
# page protocol status bytes referrer agent);
# my $logfile = readbackwards("/usr/local/apache/logs/access-log")
# my $db = FlatDB::Iterator->new($logfile, @fields);
# my $q = $db->callbackquery(sub {my %F=@_; $F{PAGE}=~ m{/book/$}});
# while (1) {
# for (1..10) {
# print NEXTVAL($q);
# }
# print "q to quit; CR to continue\n";
# chomp(my $resp = <STDIN>);
# last if $resp =~ /q/i;
# }
fields = "address rfc931 username datetime tz method page protocol status bytes referrer agent"
logfile = readbackwards("/var/log/apache2/access.log")
# Field names are separate positional arguments, hence the unpacking.
db = IterFlatDB(logfile, *fields.split())
# NOTE(review): the key "PAGE" assumes callbackquery hands the callback
# upper-cased field names - confirm against IterFlatDB.callbackquery.
q = db.callbackquery(lambda F: re.search("/book/$", F["PAGE"]))
while 1:
    # Show the next ten matches, newest first, then ask whether to go on.
    for line in itertools.islice(q, 10):
        print(line)
    print("q to quit; CR to continue")
    if raw_input() == 'q':
        break
# my $seed = 1;
# sub Rand {
# $seed = (27*$seed+11111) & 0x7fff;
# return $seed;
# }
# Module-wide generator state shared by Rand() and SRand().
seed = 1
def Rand():
    """Advance the global seed one step and return the new 15-bit value."""
    global seed
    seed = (seed * 27 + 11111) & 0x7fff
    return seed
# sub SRand {
# $seed = shift;
# }
def SRand(_seed):
    """Replace the global seed with `_seed`."""
    global seed
    seed = _seed
# SRand($$);
SRand(os.getpid())
# use CGI::Push;
# my $seed = shift || $$ ;
# srand($seed);
# open LOG, "> $logfile" or die ... ;
# print LOG "Random seed: $seed\n";
# do_push(...);
import random
# Take the seed from the command line when given, otherwise use the pid.
if len(sys.argv) > 1:
    seed = int(sys.argv[1])
else:
    seed = os.getpid()
random.seed(seed)
# Record the seed so that a failing run can be reproduced.
# NOTE(review): `logfile` must be a path string here - confirm where it is set.
LOG = open(logfile, "w")
LOG.write("Random seed: " + str(seed) + "\n")
do_push()  # arguments elided in the original: do_push(...)
# use Foo;
# while (<>) {
# my $random = Rand();
# # do something with $random
# foo();
# }
import Foo
# One random number per input line, drawn from the global generator.
for line in sys.stdin:
    random = Rand()
    # ... use `random` here ...
    Foo.foo()
# sub make_rand {
# my $seed = shift || (time & 0x7fff);
# return Iterator {
# $seed = (29*$seed+11111) & 0x7fff;
# return $seed;
# }
# }
def make_rand(seed=None):
    """Generate an endless stream of 15-bit pseudo-random integers.

    seed -- starting state; when None, the low 15 bits of the current time
            are used.

    Each generator keeps its own state, so independent streams do not
    disturb one another.
    """
    if seed is None:
        seed = int(time.time()) & 0x7fff
    while True:
        seed = (29 * seed + 11111) & 0x7fff
        yield seed
# use Foo;
# my $rng = make_rand();
# while (<>) {
# my $random = NEXTVAL($rng);
# # do something with $random
# foo();
# }
import Foo
# A private generator: nothing called inside the loop can disturb its sequence.
rng = make_rand()
for line in sys.stdin:
    random = rng.next()
    # ... use `random` here ...
    Foo.foo()
### 4.4 Filters and Transforms
# sub imap {
# my ($transform, $it) = @_;
# return Iterator {
# my $next = NEXTVAL($it);
# return unless defined $next;
# return $transform->($next);
# }
# }
# itertools.imap does this already
def imap(transform, it):
    """Generate transform(x) for each x drawn from `it`, lazily."""
    for item in it:
        yield transform(item)
# my $rng = imap(sub { $_[0] / 37268 }, make_rand());
# make_rand() yields integers in 0..0x7fff, so dividing by 0x8000 (32768)
# scales them into [0, 1). The Perl line above reads 37268, which looks like
# a transposition of 32768 and would cap the output near 0.88.
rng = imap(lambda x: float(x) / 32768, make_rand())
# sub imap (&$) {
# my ($transform, $it) = @_;
# return Iterator {
# my $next = NEXTVAL($it);
# return unless defined $next;
# return $transform->($next);
# }
# }
# my $rng = imap { $_[0] / 37268 } make_rand();
# sub imap (&$) {
# my ($transform, $it) = @_;
# return Iterator {
# local $_ = NEXTVAL($it);
# return unless defined $_;
# return $transform->();
# }
# }
# these are irrelevant changes for python
# sub igrep (&$) {
# my ($is_interesting, $it) = @_;
# return Iterator {
# local $_;
# while (defined ($_ = NEXTVAL($it))) {
# return $_ if $is_interesting->();
# }
# return;
# }
# }
def igrep(is_interesting, it):
    """Generate the items of `it` for which is_interesting(item) is true."""
    for candidate in it:
        if not is_interesting(candidate):
            continue
        yield candidate
# # instead of my $next_octopus =
# # interesting_files(\&contains_octopuses, 'uploads', 'downloads' ;
# )
# my $next_octopus = igrep { contains_octopuses($_) }
# dir_walk('uploads', 'downloads');
# while ($file = NEXTVAL($next_octopus)) {
# # do something with the file
# }
# The generator dir_walk takes a single root, so walk each tree separately
# and chain the results.
# NOTE(review): assumes `dir_walk` is bound to the one-argument generator
# version at this point - confirm.
for _file in igrep(contains_octopuses,
                   itertools.chain(dir_walk("uploads"), dir_walk("downloads"))):
    pass  # do something with the file
# sub list_iterator {
# my @items = @_;
# return Iterator {
# return shift @items;
# };
# }
def list_iterator(*args):
    """Generate the given arguments one at a time, in order."""
    # or just: return iter(args)
    for x in args:
        yield x
# sub append {
# my @its = @_;
# return Iterator {
# while (@its) {
# my $val = NEXTVAL($its[0]);
# return $val if defined $val;
# shift @its; # Discard exhausted iterator
# }
# return;
# };
def append(its):
    """Generate everything from each iterator in `its`, one after another."""
    # or just: return itertools.chain(*its)
    for it in its:
        for x in it:
            yield x
### 4.5 The Semipredicate Problem
# This whole section is irrelevant because of how
# Python uses iterators/generators,
# so I skipped it. If someone sees something
# in here that deserves a Python translation,
# let me know.
### 4.6 Alternative Interfaces to Iterators
# sub equal_arrays (\@\@) {
# my ($x, $y) = @_;
# return unless @$x == @$y; # arrays are the same length?
# for my $i (0 .. $#$x) {
# return unless $x->[$i] eq $y->[$i]; # mismatched elements
# }
# return 1; # arrays are equal
# }
def equal_arrays(x, y):
    """Return True if sequences `x` and `y` have equal length and equal elements."""
    if len(x) != len(y):
        return False  # arrays are not the same length
    for i in range(len(x)):
        if x[i] != y[i]:
            return False  # mismatched elements
    return True  # arrays are equal
# but this is unnecessary since we can already write, in place:
#     x == y
# sub equal_arrays (\@\@) {
# my ($x, $y) = @_
# return unless @$x == @$y;
# my $xy = each_array(@_ );
# while (my ($xe, $ye) = NEXTVAL($xy)) {
# return unless $xe eq $ye;
# }
# return 1;
# }
def equal_arrays(x, y):
    """Return True if `x` and `y` have equal length and equal elements.

    Elements are compared pairwise through the each_array(x, y) iterator.
    """
    if len(x) == len(y):
        for pair in each_array(x, y):
            xe, ye = pair
            if xe != ye:
                return False
        return True
    return False
# sub each_array {
# my @arrays = @_;
# my $cur_elt = 0;
# my $max_size = 0;
# # Get the length of the longest input array
# for (@arrays) {
# $max_size = @$_ if @$_ > $max_size;
# }
# return Iterator {
# $cur_elt = 0, return () if $cur_elt >= $max_size;
# my $i = $cur_elt++;
# return map $_->[$i], @arrays;
# };
# }
def each_array(*arrays):
    """Generate [a[0], b[0], ...], [a[1], b[1], ...], ... across all arrays.

    Iteration runs to the length of the longest array; shorter arrays
    contribute None once they run out. With no arrays nothing is generated.
    """
    lengths = [len(ar) for ar in arrays]
    # max() needs a non-empty sequence; passing the list itself (not *list)
    # also keeps the single-array case working.
    max_size = max(lengths) if lengths else 0

    def get_item(ar, i):
        if i < len(ar):
            return ar[i]
        return None

    for i in range(max_size):
        yield [get_item(ar, i) for ar in arrays]
# you could also probably do something clever with itertools.izip()
# my $buttons = each_array(\@labels, \@values);
# ...
# while (my ($label, $value) = NEXTVAL($buttons)) {
# print HTML qq{ $label\n};
# }
buttons = each_array(labels, values)
for label, value in buttons:
    # NOTE(review): the HTML markup around the label (and any use of
    # `value`) appears to have been lost from this string - confirm against
    # the book's Perl before relying on the output.
    HTML.write(" %(label)s\n" % locals())
# sub each_array {
# my @arrays = @_;
# my $stop_type = ref $arrays[0] ? 'maximum' : shift @arrays;
# my $stop_size = @{$arrays[0]};
# my $cur_elt = 0;
# # Get the length of the longest (or shortest) input array
# if ($stop_type eq 'maximum') {
# for (@arrays) {
# $stop_size = @$_ if @$_ > $stop_size;
# }
# } elsif ($stop_type eq 'minimum') {
# for (@arrays) {
# $stop_size = @$_ if @$_ < $stop_size;
# }
# } else {
# croak "each_array: unknown stopping behavior '$stop_type'";
# }
# return Iterator {
# return () if $cur_elt >= $stop_size;
# my $i = $cur_elt++;
# return map $_->[$i], @arrays;
# };
# }
def each_array(arrays, stop_type="maximum"):
    """Generate [a[0], b[0], ...], [a[1], b[1], ...], ... across `arrays`.

    arrays    -- sequence of sequences
    stop_type -- "maximum" runs to the longest array, padding shorter ones
                 with None; "minimum" stops at the shortest array

    Any other stop_type fails the assertion. An empty `arrays` generates
    nothing.
    """
    assert stop_type in ("minimum", "maximum"), \
        "each_array: unknown stopping behavior '%s'" % (stop_type,)
    lengths = [len(ar) for ar in arrays]
    # min()/max() need a non-empty sequence; passing the list itself (not
    # *list) also keeps the single-array case working.
    if not lengths:
        stop_size = 0
    elif stop_type == "minimum":
        stop_size = min(lengths)
    else:
        stop_size = max(lengths)

    def get_item(ar, i):
        if i < len(ar):
            return ar[i]
        return None

    for i in range(stop_size):
        yield [get_item(ar, i) for ar in arrays]
# sub eachlike (&$) {
# my ($transform, $it) = @_;
# return Iterator {
# local $_ = NEXTVAL($it);
# return unless defined $_;
# my $value = $transform->();
# return wantarray ? ($_, $value) : $value;
# }
# }
# not sure if wantarray really maps to python
# style
# package CIA;
# sub TIESCALAR {
# my $package = shift;
# my $self = {};
# bless $self => $package;
# }
# sub STORE { }
# sub FETCH { "<>" }
# tie $secret, 'CIA';
# $secret = 'atomic ray';
# print "The secret weapon is '$secret'.\n"
# the secret weapon is '<>'.
# I can't think of any reasonable way to do
# this in python. In part it seems like something
# you could handle with descriptor and in part
# with "with". I'm just going to ignore TIE-ing
# for now
### 4.7 An Extended Example: Web Spiders
# use HTML::LinkExtor;
# use LWP::Simple;
# sub traverse {
# my @queue = @_;
# my %seen;
# return Iterator {
# while (@queue) {
# my $url = shift @queue;
# $url =~ s/#.*$//;
# next if $seen{$url}++;
# my ($content_type) = head($url);
# if ($content_type =~ m{ˆtext/html\b}) {
# my $html = get($url);
# push @queue, get_links($url, $html);
# }
# return $url;
# }
# return; # exhausted
# }
# }
import urllib2
def traverse(_queue):
    """Generate each distinct URL reachable from the URLs in `_queue`.

    URLs are visited breadth-first with their #fragment removed. Pages whose
    content type is text/html are fetched and their links queued. A URL that
    cannot be opened is reported on stdout and not yielded. The caller's
    list is not modified.
    """
    queue = _queue[:]
    seen = set()
    while queue:
        url = queue.pop(0).split("#")[0]
        if url in seen:
            continue
        seen.add(url)
        try:
            page = urllib2.urlopen(url)
        except urllib2.URLError:
            # URLError is the base class of HTTPError, so this also covers
            # connection failures and unresolvable hosts.
            print("http error for: %s" % url)
            continue
        # Servers may omit the header; treat that as "not HTML".
        content_type = page.headers.getheader("content-type") or ""
        if re.search(r"^text/html\b", content_type):
            html = page.read()
            queue.extend(get_links(url, html))
        yield url
# sub get_links {
# my ($base, $html) = @_;
# my @links;
# my $more_links = sub {
# my ($tag, %attrs) = @_;
# push @links, values %attrs;
# };
# HTML::LinkExtor->new($more_links, $base)->parse($html);
# return @links;
# }
# Off the top of my head I don't know a python library
# that provides this exact functionality, so we
# fake it.
def get_links(base, html):
    """Return the absolute URL of every <a href=...> in `html`.

    base -- URL of the page `html` was fetched from
    html -- the page's markup

    Each href is resolved against `base` with urlparse.urljoin, which
    handles absolute URLs, "/rooted", "./", "../" and sibling-relative
    references. Anchors without an href are ignored.
    """
    links = []
    for anchor in BeautifulSoup.BeautifulSoup(html)('a'):
        link = anchor.get("href")
        if not link:
            continue
        links.append(urlparse.urljoin(base, link))
    return links
# # Version with 'interesting links' callback
# sub traverse {
# my $interesting_links = sub { @_ };
# $interesting_links = shift if ref $_[0] eq 'CODE';
# ...
# push @queue, $interesting_links->(get_links($url, $html));
# ...
# }
def traverse(queue, interesting_links=None):
    """Generate each distinct URL reachable from `queue`, filtering links.

    queue             -- list of starting URLs (not modified)
    interesting_links -- function taking the list of links found on a page
                         and returning those worth following; by default
                         every link is followed

    URLs are visited breadth-first with their #fragment removed. A URL that
    cannot be opened is reported on stdout and not yielded.
    """
    if interesting_links is None:
        interesting_links = lambda links: links
    queue = list(queue)
    seen = set()
    while queue:
        url = queue.pop(0).split("#")[0]
        if url in seen:
            continue
        seen.add(url)
        try:
            page = urllib2.urlopen(url)
        except urllib2.URLError:
            print("http error for: %s" % url)
            continue
        # Servers may omit the header; treat that as "not HTML".
        content_type = page.headers.getheader("content-type") or ""
        if re.search(r"^text/html\b", content_type):
            html = page.read()
            queue.extend(interesting_links(get_links(url, html)))
        yield url
# my $top = 'http://perl.plover.com/';
# my $interesting = sub { grep /ˆ\Q$top/o, @_ };
# my $urls = traverse($interesting, $top);
top = "http://perl.plover.com"
# Keep only links that start with `top`, like the Perl grep /^\Q$top/.
interesting = lambda links: [link for link in links if link.startswith(top)]
# traverse() takes the list of start URLs first, then the filter.
urls = traverse([top], interesting)
# use File::Basename;
# while (my $url = NEXTVAL($urls)) {
# my $file = $url;
# $file =~ s/ˆ\Q$top//o;
# my $dir = dirname($file);
# system('mkdir', '-p', $dir) == 0 or next;
# open F, ">", $file or next;
# print F get($url);
# }
for url in urls:
    # Strip the site prefix and any leading slash so the page is written
    # below the current directory, never to an absolute path.
    _file = url
    if _file.startswith(top):
        _file = _file[len(top):]
    _file = _file.lstrip("/")
    # Refuse paths that would climb out of the current directory.
    if os.path.normpath(_file).startswith(".."):
        continue
    _dir = os.path.dirname(_file)
    try:
        # os.makedirs replaces `mkdir -p`, so no shell sees the URL.
        if _dir and not os.path.isdir(_dir):
            os.makedirs(_dir)
        F = open(_file, "w")
    except (IOError, OSError):
        continue
    try:
        F.write(urllib2.urlopen(url).read())
    finally:
        F.close()
# while (my $url = NEXTVAL($urls)) {
# print "Bad link to: $url" unless head($url);
# }
for url in urls:
    try:
        urllib2.urlopen(url)
    except Exception:
        # Any failure to open counts as a bad link. Unlike a bare
        # `except:`, this leaves Ctrl-C and SystemExit alone.
        print("Bad link to: %s" % url)
# sub traverse {
# ...
# my (%head, $html);
# @head{qw(TYPE LENGTH LAST_MODIFIED EXPIRES SERVER)} = head($url);
# if ($head{TYPE} =~ m{^text/html\b}) {
# $html = get($url);
# push @queue, $interesting_links->(get_links($url,$html));
# }
# return wantarray ? ($url, \%head, $html) : $url;
# ...
# }
# I don't think there is a straightforward way to duplicate
# "wantarray"-style functionality in Python. In any case
# it would be more uniform to *always* return the tuple.
# sub traverse {
# my $interesting_links = sub { shift; @_ };
# $interesting_links = shift if ref $_[0] eq 'CODE';
# my @queue = map [$_, 'supplied by user'], @_;
# my %seen;
# return Iterator {
# while (@queue) {
# my ($url, $referrer) = @{shift @queue};
# $url =~ s/#.*$//;
# next if $seen{$url}++;
# my (%head, $html);
# @head{qw(TYPE LENGTH LAST_MODIFIED EXPIRES SERVER)} = head($url);
# if ($head{TYPE} =~ m{ˆtext/html\b}) {
# my $html = get($url);
# push @queue,
# map [$_, $url],
# $interesting_links->($url, get_links($url, $html));
# }
# return wantarray ? ($url, \%head, $referrer, $html) : $url;
# }
# return; #exhausted
# }
# }
import urllib2
def traverse(queue, interesting_links=None):
    """Breadth-first crawl, yielding one tuple per distinct URL visited.

    queue             -- iterable of start URLs.
    interesting_links -- optional callable (this_url, other_urls) returning
                         the links from `other_urls` that should be followed;
                         by default every extracted link is followed.

    Yields (url, headers, referrer, html):
      * headers is None when the URL could not be fetched;
      * html is None unless the response is served as text/html.
    The referrer of a start URL is the string "supplied by user".
    """
    if interesting_links is None:
        def interesting_links(this_url, other_urls):
            return other_urls
    queue = [(x, "supplied by user") for x in queue]
    seen = set()
    while queue:
        url, referrer = queue.pop(0)
        # Fragments address the same document; drop them before de-duplicating.
        url = url.split("#")[0]
        if url in seen:
            continue
        seen.add(url)
        try:
            page = urllib2.urlopen(url)
        except urllib2.URLError:
            # HTTPError is a subclass of URLError, so this also covers
            # unreachable hosts instead of letting them abort the crawl.
            print("http error for: %s" % url)
            yield url, None, referrer, None
            continue
        # Reset per page: otherwise a non-HTML page would either hit an
        # unbound name or re-yield the previous page's body.
        html = None
        # The Content-Type header may be absent; treat that as "not HTML".
        content_type = page.headers.getheader("content-type") or ""
        if re.search(r"^text/html\b", content_type):
            html = page.read()
            queue.extend([(x, url) for x in interesting_links(url, get_links(url, html))])
        page.close()
        yield url, page.headers, referrer, html
# my $top = 'http://perl.plover.com/'
# my $interesting = sub { shift; grep /^\Q$top/o, @_ };
# my $urls = traverse($interesting, $top);
# while (my ($url, $head, $referrer) = NEXTVAL($urls)) {
# next if $head->{TYPE};
# print "Page '$referrer' has a bad link to '$url'\n";
# }
top = "http://perl.plover.com"
# Follow only links that live under `top`; startswith() mirrors the anchored
# /^\Q$top/ match of the Perl version.
interesting = (lambda x, y: [_y for _y in y if _y.startswith(top)])
urls = traverse([top], interesting)
for url, head, referrer, html in urls:
    # traverse() yields head=None exactly when the fetch failed, so any
    # page that does have headers is a good link and is skipped.
    if head is not None:
        continue
    print("Page '%s' has a bad link to '%s'" % (referrer, url))
# my $top = 'http://perl.plover.com/';
# my $interesting = sub { shift; grep /^\Q$top/o, @_ };
# my $urls = igrep_l { not $_[1]{TYPE} } traverse($interesting, $top);
# while (my ($url, $head, $referrer) = NEXTVAL($urls)) {
# print "Page '$referrer' has a bad link to '$url'\n";
# }
top = "http://perl.plover.com"
# Follow only links that live under `top`; startswith() mirrors the anchored
# /^\Q$top/ match of the Perl version.
interesting = (lambda x, y: [_y for _y in y if _y.startswith(top)])
# Let igrep_l keep only the failed fetches (head is None), so the loop body
# needs no filtering of its own. Re-testing inside the loop, as before,
# discarded every row the filter had just selected.
urls = igrep_l((lambda url, head, referrer, html: head is None), traverse([top], interesting))
for url, head, referrer, html in urls:
    print("Page '%s' has a bad link to '%s'" % (referrer, url))
# sub igrep_l (&$) {
# my ($is_interesting, $it) = @_;
# return Iterator {
# while (my @vals = NEXTVAL($it)) {
# return @vals if $is_interesting->(@vals);
# }
# return;
# }
# }
def igrep_l(is_interesting, it):
    """Lazily filter an iterator of tuples.

    Each item drawn from `it` is unpacked into positional arguments for
    `is_interesting`; the item is yielded unchanged when the predicate
    returns a true value.
    """
    for candidate in it:
        if not is_interesting(*candidate):
            continue
        yield candidate
# while (my ($url, $head, $referrer) = NEXTVAL($urls)) {
# print "Page '$referrer' has a bad link to '$url'\n";
# print "Edit now? ";
# my $resp = <>;
# if ($resp =~ /^y/i) {
# system $ENV{EDITOR}, url_to_filename($referrer);
# } elsif ($resp =~ /^q/i) {
# last;
# }
# }
import subprocess

# Walk the bad links interactively, offering to open the referring page in
# $EDITOR after each one.
for url, head, referrer, html in urls:
    print("Page '%s' has a bad link to '%s'" % (referrer, url))
    try:
        resp = raw_input("Edit now? ").lower()
    except EOFError:
        # No more input available: stop prompting.
        break
    # Like the Perl /^y/i and /^q/i tests, accept any answer that starts
    # with the letter, in either case ("y", "Yes", "quit", ...).
    if resp.startswith("y"):
        # Argument-list form runs the editor without a shell, so the file
        # name derived from crawled data cannot be interpreted as shell syntax.
        subprocess.call([os.environ["EDITOR"], url_to_filename(referrer)])
    elif resp.startswith("q"):
        break
# sub traverse {
# my $interesting_link;
# $interesting_link = shift if ref $_[0] eq 'CODE';
# my @queue = map [$_, 'supplied by user'], @_;
# my %seen;
# my $q_it = igrep { ! $seen{$_->[0]}++ }
# imap { $_->[0] =~ s/#.*$//; $_}
# Iterator { return shift(@queue) };
# if ($interesting_link) {
# $q_it = igrep {$interesting_link->(@$_)} $q_it;
# }
# return imap {
# my ($url, $referrer) = @$_;
# my (%head, $html);
# @head{qw(TYPE LENGTH LAST_MODIFIED EXPIRES SERVER)} = head($url);
# if ($head{TYPE} =~ m{^text/html\b}) {
# $html = get($url);
# push @queue,
# map [$_, $url],
# get_links($url, $html);
# }
# return wantarray ? ($url, \%head, $referrer, $html) : $url;
# } $q_it;
# }
# This is not an exact match but is close enough for our
# purposes, whatever those purposes might be.
def traverse(queue, interesting_link=None):
    """Crawl from the given start URLs using a pipeline of lazy iterators.

    queue            -- iterable of start URLs.
    interesting_link -- optional predicate handed to igrep() to filter the
                        (url, referrer) pairs before they are fetched.

    Returns an iterator of (url, headers, referrer, html) tuples:
      * headers is None when the URL could not be fetched;
      * html is None unless the response is served as text/html.
    Links found in fetched HTML pages are appended to the work queue, which
    the pipeline keeps draining until it is empty.
    """
    seen = set()
    queue = [(x, "supplied by user") for x in queue]

    def iterate_queue():
        # Re-checks the queue on every step, so entries appended by
        # process_url() while the pipeline is running are picked up.
        while queue:
            yield queue.pop(0)

    def not_seen_yet(url):
        if url in seen:
            return False
        seen.add(url)
        return True

    q_it = iterate_queue()
    # Fragments address the same document; drop them before de-duplicating.
    q_it = ((url.split("#")[0], referrer) for url, referrer in q_it)
    q_it = (item for item in q_it if not_seen_yet(item[0]))
    if interesting_link is not None:
        q_it = igrep(interesting_link, q_it)

    def process_url(item):
        url, referrer = item
        print("process_url: %s %s" % (url, referrer))
        try:
            page = urllib2.urlopen(url)
        except urllib2.URLError:
            # HTTPError is a subclass of URLError, so this also covers
            # unreachable hosts instead of letting them abort the crawl.
            print("http error for: %s" % url)
            return url, None, referrer, None
        # Must be bound even for non-HTML responses, otherwise the return
        # statement below fails on an unassigned local.
        html = None
        # The Content-Type header may be absent; treat that as "not HTML".
        content_type = page.headers.getheader("content-type") or ""
        if re.search(r"^text/html\b", content_type):
            html = page.read()
            queue.extend([(x, url) for x in get_links(url, html)])
        page.close()
        return url, page.headers, referrer, html

    return imap(process_url, q_it)
# sub make_robot_filter {
# my $agent = shift;
# my %seen_site;
# my $rules = WWW::RobotRules->new($agent);
# return sub {
# my $url = url(shift());
# return 1 unless $url->scheme eq 'http';
# unless ($seen_site{$url->netloc}++) {
# my $robots = $url->clone;
# $robots->path('/robots.txt');
# $robots->frag(undef);
# $rules->parse($robots, get($robots));
# }
# $rules->allowed($url)
# };
# }
def make_robot_filter(agent):
    """Build a predicate that says whether `agent` may fetch a URL.

    The returned function takes a URL string. Non-"http" URLs are always
    allowed. For "http" URLs the site's /robots.txt is downloaded the first
    time that host is seen, and the decision is delegated to
    RobotFileParser.can_fetch(agent, url).
    """
    rules = {}  # netloc -> RobotFileParser whose robots.txt has been read

    def _filter(url):
        u = urlparse.urlparse(url)
        if u.scheme != "http":
            return True
        parser = rules.get(u.netloc)
        if parser is None:
            parser = robotparser.RobotFileParser()
            parser.set_url(u.scheme + "://" + u.netloc + "/robots.txt")
            parser.read()
            # Cache only after read() has returned: if it raises, the next
            # call retries instead of consulting a parser that never loaded
            # any rules.
            rules[u.netloc] = parser
        return parser.can_fetch(agent, url)

    return _filter
No comments:
Post a Comment