lair of the dustbunny: Higher Order Perl (Python Style) : Chapter 4

TOC

### CHAPTER 4 Iterators


### 4.1 introduction

# @lines = open('filename'); # alternate universe interface
# The file object returned by open() hands out its lines on demand when it is
# iterated, so nothing is read into memory by this statement alone.
lines = open("filename") # a less alternate universe interface


# open(FILEHANDLE, 'filename');
# while (<FILEHANDLE>) {
#   last if /Plutonium/;
# }
# close FILEHANDLE;
# # do something with $_;
# Read the file one line at a time and stop at the first line that mentions
# Plutonium; the rest of the file is never read.
fh = open("filename")
for line in fh:
    if "Plutonium" in line:
        break
fh.close()
# `line` is the matching line here, or the last line read if nothing matched
# (it is unbound if the file was empty).
# do something with line




# # alternate universe interface
# @lines = open('filename');
# for (@lines) {
#   last if /Plutonium/;
# }
# # do something with $_;
# Slurp the whole file into a list first, then scan the list.
# The handle is bound to a name so that *this* file is the one that gets
# closed (the original closed an unrelated, earlier handle).
fh = open("filename")
lines = fh.readlines()
fh.close()
for line in lines:
    if "Plutonium" in line:
        break
# do something with line


# @lines = open("yes |"); # alternate universe interface
# NOTE: `yes` never stops producing output, so readlines() cannot return here;
# that is the drawback of an interface that must collect every line up front.
lines = os.popen("yes").readlines() # alternate universes interface



# sub parse_section {
#   my $fh = shift;
#   my $title = parse_section_title($fh);
#   my %variables = parse_variables($fh);
#   return [$title, \%variables];
# }
def parse_section(fh):
    """Parse one section from `fh` and return it as a (title, variables) pair.

    The title is read first and the variables second, both from the same handle.
    """
    return parse_section_title(fh), parse_variables(fh)


# sub parse_section {
#   my @lines = @_;
#   my $title = parse_section_title(@lines);
#   my %variables = parse_variables(@lines);
#   return [$title, \%variables];
# }
# In the python case we *could* easily
# pop values from the list (but really you 
# could in perl as well)
def parse_section(lines):
    """Parse one section from a list of lines; return (title, variables)."""
    section = (parse_section_title(lines), parse_variables(lines))
    return section




# opendir D, "/tmp";
# @entries = readdir D;
# In Python os.listdir() returns a complete list instead of an iterator
entries = os.listdir("/tmp")


# opendir D, "/tmp";
# while (my $entry = readdir D) {
#   # Do something with $entry
# }
# Python doesn't have "scalar" mode type behavior changes
for entry in os.listdir("/tmp"):
    # Do something with entry
    pass  # a comment alone is not a valid loop body


# while (my $file = glob("/tmp/*.[ch]")) {
#   # Do something with $file
# }
# glob in python is also not an iterator
for _file in glob.glob("/tmp/*.[ch]"):
    # Do something with _file
    pass  # a comment alone is not a valid loop body

# while (my $key = each %hash) {
#   # Do something with $key
# }
# depending on version of python 
# hash may automagically be an iterator
# (maybe all versions...)
# Iterating a dictionary yields its keys directly; unlike iterkeys() this
# spelling works on every Python version that has iterators.
for key in hash:
    # Do something with key
    pass  # a comment alone is not a valid loop body




# @matches = ("12:34:56" =~ m/(\d+)/g);
# Raw string: "\d" in an ordinary literal is an invalid escape sequence.
matches = re.findall(r"(\d+)", "12:34:56")

# while ("12:34:56" =~ m/(\d+)/g) {
#   # do something with $1
# }
# finditer() produces the matches lazily, one match object per iteration.
for m in re.finditer(r"(\d+)", "12:34:56"):
    # do something with m (where m is a "match" object)
    pass  # a comment alone is not a valid loop body


### 4.2 Homemade Iterators

# sub dir_walk {
#   my ($dir, $filefunc, $dirfunc, $user) = @_;
#   my $iterator = make_iterator($dir);
#   while (my $filename = NEXTVAL($iterator)) {
#     if (-f $filename) { $filefunc->($filename, $user) }
#     else              {  $dirfunc->($filename, $user) }
#   }
# }
# In python os.walk returns an iterator as is
def dir_walk(dir, filefunc, dirfunc, user):
    """Call a handler for every name produced by make_iterator(dir).

    Plain files go to filefunc(name, user); everything else goes to
    dirfunc(name, user).  Nothing is returned.
    """
    for path in make_iterator(dir):
        handler = filefunc if os.path.isfile(path) else dirfunc
        handler(path, user)


# sub upto {
#   my ($m, $n) = @_;
#   return sub {
#     return $m <= $n ? $m++ : undef;
#   };
# }
# my $it = upto(3, 5);
def upto(m,n):
    """Return a function that hands out m, m+1, ..., n on successive calls.

    Once n has been passed, every further call returns None.
    """
    # A closure cannot rebind an outer local here, so the counter lives
    # inside a mutable container.
    state = {"next": m}

    def step():
        current = state["next"]
        state["next"] = current + 1
        if current > n:
            return None
        return current

    return step

# Build an iterator that will produce 3, 4 and 5.
it = upto(3,5)
## of course in python it's more natural to do the following:
# def upto(m,n):
#     for x in range(m,n+1):
#         yield x


# my $nextval = $it->();
# Calling the iterator fetches its next value.
nextval = it()


# while (defined(my $val = $it->())) {
#   # now do something with $val, such as:
#   print "$val\n";
# }
# this doesn't translate in a pretty way to python
# since we can't have statements in a while 
# context
val = it()
while val is not None:
    # now do something with val, such as:
    print(val)
    val = it()
# but of course we'd just use a for loop.  `it` is a plain function here, not
# an iterable, so wrap it with the two-argument form of iter(), which keeps
# calling it until it returns the sentinel None:
for val in iter(it, None):
    print(val)



# for my $val (1 .. 10000000) {
#    # now do something with $val
# }
# Perl's 1 .. 10000000 includes the upper end, so the stop value is one more.
# NOTE: on Python 2 use xrange() here; range() builds the whole list first.
for val in range(1, 10000001):
    # now do something with val
    pass  # a comment alone is not a valid loop body

# package Iterator_Utils;
# use base Exporter;
# @EXPORT_OK = qw(NEXTVAL Iterator
#                 append imap igrep
#                 iterate_function filehandle_iterator list_iterator);
# %EXPORT_TAGS = ('all' => \@EXPORT_OK);
# sub NEXTVAL { $_[0]->() }

# my $nextval = NEXTVAL($it);

# while (defined(my $val = NEXTVAL($it))) {
#   # now do something with $val
# }

# No need to do these machinations since this is already built into
# python except we'd do this with "for"
for val in it:
    # now do something with val
    pass  # a comment alone is not a valid loop body


# sub upto {
#   my ($m, $n) = @_;
#   return Iterator {
#     return $m <= $n ? $m++ : undef;
#   };
# }
# sub Iterator (&) { return $_[0] }
# in python we just do this with a yield
def upto(m, n):
    """Generate m, m+1, ..., n in order; nothing is produced when m > n."""
    current = m
    while True:
        if not current <= n:
            return
        yield current
        current += 1


# # iterator version
# sub dir_walk {
#   my @queue = shift;
#   return Iterator {
#     while (@queue) {
#       my $file = shift @queue;
#       if (-d $file) {
#         opendir my $dh, $file or next;
#         my @newfiles = grep {$_ ne "." && $_ ne ".."} readdir $dh;
#         push @queue, map "$file/$_", @newfiles;
#       }
#       return $file;
#     } else {
#       return;
#     }
#   };
# }
def dir_walk(root):
    """Generate `root` and every path below it, breadth first.

    A directory whose contents cannot be listed is skipped altogether, the
    way the Perl original's `opendir ... or next` skips it, instead of
    aborting the whole walk with an exception.
    """
    queue = [root]
    while queue:
        _file = queue.pop(0)
        if os.path.isdir(_file):
            try:
                names = os.listdir(_file)
            except OSError:
                # unreadable (or vanished) directory: move on to the next entry
                continue
            queue.extend([os.path.join(_file, name) for name in names])
        yield _file


                
                                
# sub dir_walk {
#   my ($top, $code) = @_;
#   my $DIR;
#   $code->($top);
#   if (-d $top) {
#     my $file;
#     unless (opendir $DIR, $top) {
#       warn "Couldn’t open directory $top: $!; skipping.\n";
#       return;
#     }
#     while ($file = readdir $DIR) {
#       next if $file eq '.'|| $file eq '..'
#       dir_walk("$top/$file", $code);
#     }
#   }
# }
def dir_walk(top, code):
    """Call code(path) for `top` and, recursively, for everything below it.

    If a directory cannot be listed, a message is printed and that directory
    is skipped.  Only the listing itself is guarded: exceptions raised by
    `code` propagate to the caller instead of being reported as a directory
    that could not be opened.
    """
    code(top)
    if not os.path.isdir(top):
        return
    try:
        names = os.listdir(top)
    except OSError as why:
        print("Couldn't open directory %s: %s" % (top, why))
        return
    for name in names:
        dir_walk(os.path.join(top, name), code)


### 4.3 Examples

# sub interesting_files {
#   my $is_interesting = shift;
#   my @queue = @_;
#   return Iterator {
#     while (@queue) {
#       my $file = shift @queue;
#       if (-d $file) {
#         opendir my $dh, $file or next;
#         my @newfiles = grep {$_ ne "." && $_ ne ".."} readdir $dh;
#         push @queue, map "$file/$_", @newfiles;
#       }
#       return $file if $is_interesting->($file);
#     }
#     return;
#   };
# }
def interesting_files(is_interesting, *top_dirs):
    """Generate, breadth first, each path under `top_dirs` accepted by the predicate.

    is_interesting(path) is called for every file and directory visited.
    A directory whose contents cannot be listed is skipped (as the Perl
    original's `opendir ... or next` does) rather than ending the search with
    an exception.
    """
    queue = list(top_dirs)

    while queue:
        _file = queue.pop(0)
        if os.path.isdir(_file):
            try:
                names = os.listdir(_file)
            except OSError:
                # unreadable (or vanished) directory: move on to the next entry
                continue
            queue.extend([os.path.join(_file, name) for name in names])
        if is_interesting(_file):
            yield _file


# # Files are deemed to be interesting if they mention octopuses
# sub contains_octopuses {
#   my $file = shift;
#   return unless -T $file && open my($fh), "<", $file;
#   while (<$fh>) {
#     return 1 if /octopus/i;
#   }
#   return;
# }
# my $octopus_file =
#   interesting_files(\&contains_octopuses, 'uploads', 'downloads');
# while ($file = NEXTVAL($octopus_file)) {
#   # do something with the file
# }
# if (NEXTVAL($next_octopus)) {
#   # yes, there is an interesting file
# } else {
#   # no, there isn’t.
# }
# undef $next_octopus;
def contains_octopuses(_file):
    """Return True if the regular file `_file` mentions "octopus", else False.

    The match ignores case, like the Perl original's /octopus/i.  Anything
    that is not a regular file yields False.
    """
    if not os.path.isfile(_file):
        return False
    fh = open(_file)
    try:
        for line in fh:
            if "octopus" in line.lower():
                return True
    finally:
        # close the handle even when returning early from the loop
        fh.close()
    return False
octopus_file = interesting_files(contains_octopuses, "uploads", "downloads")
for _octopus_file in octopus_file:
    # do something with the file
    pass

# Ask a fresh iterator for a single value to learn whether any interesting
# file exists at all, then throw the iterator away.
next_octopus = interesting_files(contains_octopuses, "uploads", "downloads")
try:
    next(next_octopus)
    # yes, there is an interesting file
except StopIteration:
    # no there isn't
    pass
del next_octopus
    


# sub permute {
#     my @items = @{ $_[0] };
#     my @perms = @{ $_[1] };
#     unless (@items) {
#         print "@perms\n";
#     } else {
#         my(@newitems,@newperms,$i);
#         foreach $i (0 .. $#items) {
#             @newitems = @items;
#             @newperms = @perms;
#             unshift(@newperms, splice(@newitems, $i, 1));
#             permute([@newitems], [@newperms]);
#         }
#     }
# }
# # sample call:
# permute([qw(red yellow blue green)], []);
# Recursive version: each permutation is printed as soon as it is complete,
# so output starts immediately instead of after all of them are computed.
def permute(items, perms):
    """Print every ordering of `items`, each one followed by `perms`.

    `items` holds the elements still to be placed and `perms` the elements
    already chosen.  As in the Perl original, each newly chosen element is
    put in front of `perms`, so the result is one flat list per permutation.
    Neither argument is modified.
    """
    if not items:
        print(perms)
        return
    for i in range(len(items)):
        newitems = items[:]
        newitem = newitems.pop(i)
        permute(newitems, [newitem] + perms)
# sample call
permute(["red", "yellow", "blue", "green"], [])



# my $it = permute('A'..'D');
# while (my @p = NEXTVAL($it)) {
#   print "@p\n";
# }
# Consume the permutation iterator one permutation at a time.
it = permute(list("ABCD"))
for p in it:
    print(p)



# sub permute {
#   my @items = @_;
#   my @pattern = (0) x @items;
#   return Iterator {
#     return unless @pattern;
#     my @result = pattern_to_permutation(\@pattern, \@items);
#     @pattern = increment_pattern(@pattern);
#     return @result;
#   };
# }
def permute(items):
    """Generate permutations of `items`, driven by an odometer-like pattern.

    The pattern starts as all zeros; each step turns it into a permutation
    and advances it, until increment_pattern() reports no further pattern.
    """
    odometer = [0] * len(items)

    while odometer:
        current = pattern_to_permutation(odometer, items)
        odometer = increment_pattern(odometer)
        yield current


# sub pattern_to_permutation {
#   my $pattern = shift;
#   my @items = @{shift()};
#   my @r;
#   for (@$pattern) {
#     push @r, splice(@items, $_, 1);
#   }
#   @r;
# }
def pattern_to_permutation(pattern, items):
    """Turn `pattern` into a permutation of `items`.

    Each pattern entry is an index into the items *still remaining*; the
    chosen item is removed before the next entry is applied.  `items` itself
    is left untouched because the work is done on a copy.
    """
    pool = items[:]
    return [pool.pop(position) for position in pattern]



# sub increment_odometer {
#   my @odometer = @_;
#   my $wheel = $#odometer;    # start at rightmost wheel
#   until ($odometer[$wheel] < 9 || $wheel < 0) {
#     $odometer[$wheel] = 0;
#     $wheel--; # next wheel to the left
#   }
#   if ($wheel < 0) {
#     return;   # fell off the left end; no more sequences
#   } else {
#     $odometer[$wheel]++;  # this wheel now turns one notch
#     return @odometer;
#   }
# }
def increment_odometer(odometer):
    """Return the odometer reading that follows `odometer`, or None at the end.

    `odometer` is a list of digits 0-9, most significant first.  The result
    is a new list; the argument is not modified (the Perl original also
    works on a copy).  None is returned when every wheel is at 9 and also
    for an empty odometer.
    """
    odometer = list(odometer)
    wheel = len(odometer) - 1          # start at rightmost wheel
    # Check the index before using it, so that falling off the left end
    # never reads odometer[-1].
    while wheel >= 0 and odometer[wheel] >= 9:
        odometer[wheel] = 0
        wheel -= 1                     # next wheel to the left
    if wheel < 0:
        return None                    # no more sequences
    odometer[wheel] += 1               # this wheel now turns one notch
    return odometer



# sub increment_pattern {
#   my @odometer = @_;
#   my $wheel = $#odometer;    # start at rightmost wheel
#   until ($odometer[$wheel] < $#odometer-$wheel || $wheel < 0) {
#     $odometer[$wheel] = 0;
#     $wheel--; # next wheel to the left
#   }
#   if ($wheel < 0) {
#     return;   # fell off the left end; no more sequences
#   } else {
#     $odometer[$wheel]++; # this wheel now turns one notch
#     return @odometer;
#   }
# }
def increment_pattern(odometer):
    """Return the permutation pattern that follows `odometer`, or None at the end.

    Works like an odometer whose wheels have different sizes: the wheel at
    index w may count up to len(odometer) - 1 - w, so the rightmost wheel is
    always 0.  The result is a new list; the argument is not modified (the
    Perl original also works on a copy).  None is returned after the last
    pattern and for an empty pattern.
    """
    odometer = list(odometer)
    last = len(odometer) - 1
    wheel = last                       # start at rightmost wheel
    # Check the index before using it, so that falling off the left end
    # never reads odometer[-1].
    while wheel >= 0 and odometer[wheel] >= last - wheel:
        odometer[wheel] = 0
        wheel -= 1                     # next wheel to the left
    if wheel < 0:
        return None                    # no more sequences
    odometer[wheel] += 1               # this wheel now turns one notch
    return odometer


# sub n_to_pat {
#   my @odometer;
#   my ($n, $length) = @_;
#   for my $i (1 .. $length) {
#     unshift @odometer, $n % $i;
#     $n = int($n/$i);
#   }
#   return $n ? () : @odometer;
# }
def n_to_pat(n, length):
    """Convert the number `n` into a permutation pattern of `length` entries.

    The entries are mixed-radix digits of n: the rightmost uses radix 1, the
    next radix 2, and so on up to `length`.  Returns [] when n is too large
    to be represented in `length` entries (and when `length` is 0).
    """
    odometer = []
    for i in range(1, length + 1):
        odometer.insert(0, n % i)
        n //= i   # explicit integer division; "/" yields a float on Python 3
    if n:
        return []
    return odometer


# sub permute {
#   my @items = @_;
#   my $n = 0;
#   return Iterator {
#     my @pattern = n_to_pat($n, scalar(@items));
#     my @result = pattern_to_permutation(\@pattern, \@items);
#     $n++;
#     return @result;
#   };
# }
def permute(items):
    """Generate permutations of `items` by counting n = 0, 1, 2, ...

    Each n is converted to a pattern with n_to_pat(); generation stops at
    the first n for which that pattern comes back empty.
    """
    n = 0
    while True:
        pattern = n_to_pat(n, len(items))
        if not pattern:
            return
        yield pattern_to_permutation(pattern, items)
        n += 1


# sub iterate_function {
#   my $n = 0;
#   my $f = shift;
#   return Iterator {
#     return $f->($n++);
#   };
# }
def iterate_function(f):
    """Generate f(0), f(1), f(2), ... without end."""
    index = 0
    while True:
        value = f(index)
        index += 1
        yield value


# sub permute {
#   my @items = @_;
#   my $n = 0;
#   return Iterator {
#     $n++, return @items if $n==0;
#     my $i;
#     my $p = $n;
#     for ($i=1; $i<=@items && $p%$i==0; $i++) {
#       $p /= $i;
#     }
#     my $d = $p % $i;
#     my $j = @items - $i;
#     return if $j < 0;
#     @items[$j+1..$#items] = reverse @items[$j+1..$#items];
#     @items[$j,$j+$d] = @items[$j+$d,$j];
#     $n++;
#     return @items;
#   };
# }
 def permute(_items):
    n = 0
    items = _items[:]

    if n == 0:
        yield items

    n += 1
    while 1:
        # make a copy so list(permute(my_list)) returns n copies of same item
        # otherwise can remove
        items = items[:] 
        i = 1
        p = n
        while i <= len(items)+1 and p % i == 0:
            p /= i
            i += 1
        d = p % i
        j = len(items) - i 

        if j < 0:
            return

        items[j+1:len(items)] = reversed(items[j+1:len(items)])
        x,y = items[j+d], items[j]
        items[j] = x
        items[j+d] = y 
        n += 1

        yield items



# sub make_genes {
#   my $pat = shift;
#   my @tokens = split /[()]/, $pat;
#   for (my $i = 1; $i < @tokens; $i += 2) {
#     $tokens[$i] = [0, split(//, $tokens[$i])];
#   }
#   my $FINISHED = 0;
#   return Iterator {
#     return if $FINISHED;
#     my $finished_incrementing = 0;
#     my $result = "";
#     for my $token (@tokens) {
#       if (ref $token eq "") {    # plain string
#         $result .= $token;
#       } else {                   # wildcard
#         my ($n, @c) = @$token;
#         $result .= $c[$n];
#         unless ($finished_incrementing) {
#           if ($n == $#c) { $token->[0] = 0 }
#           else { $token->[0]++; $finished_incrementing = 1 }
#         }
#       }
#     }
#     $FINISHED = 1 unless $finished_incrementing;
#     return $result;
#   }
# }
def make_genes(pat):
    """Generate every string described by `pat`.

    Text inside parentheses is a wildcard standing for any one of its
    characters; all other text is literal.  The leftmost wildcard varies
    fastest.
    """
    tokens = re.split("[()]", pat)

    # Odd positions are the parenthesised parts: store them as
    # [current index, choice, choice, ...].
    for idx in range(1, len(tokens), 2):
        tokens[idx] = [0] + list(tokens[idx])

    exhausted = False
    while not exhausted:
        advanced = False
        pieces = []
        for token in tokens:
            if type(token) is str:
                pieces.append(token)
                continue
            position, choices = token[0], token[1:]
            pieces.append(choices[position])
            if advanced:
                continue
            if position == len(choices) - 1:
                token[0] = 0            # wrap around and carry to the next wildcard
            else:
                token[0] = position + 1
                advanced = True
        # Every wildcard wrapped around: this is the last string.
        exhausted = not advanced
        yield "".join(pieces)




# %n_expand = qw(N ACGT 
#                B CGT D AGT H ACT V ACG
#                K GT M AC R AG S CG W AT Y CT);
# sub make_dna_sequences {
#   my $pat = shift;
#   for my $abbrev (keys %n_expand) {
#     $pat =~ s/$abbrev/($n_expand{$abbrev})/g;
#   }
#   return make_genes($pat);
# }
# Nucleotide abbreviation -> the set of bases it stands for.
n_expand = {"N" : "ACGT",
            "B" : "CGT", "D" : "AGT", "H" : "ACT", "V" : "ACG",
            "K" : "GT",  "M" : "AC",  "R" : "AG",  "S" : "CG",  "W" : "AT", "Y" : "CT"}
def make_dna_sequences(pat):
    """Return a make_genes() iterator over all sequences matching `pat`.

    Every abbreviation in `pat` is replaced by a parenthesised wildcard,
    e.g. "N" becomes "(ACGT)".  The parentheses are what make_genes()
    recognises as a wildcard; without them the expansion would be taken as
    literal text.
    """
    def wildcard(match):
        return "(%s)" % n_expand[match.group(0)]

    # All abbreviations are single letters, so one character class covers
    # them and the substitution happens in a single pass.
    abbrev_class = "[%s]" % "".join(n_expand)
    return make_genes(re.sub(abbrev_class, wildcard, pat))



# sub filehandle_iterator {
#   my $fh = shift;
#   return Iterator { <$fh> };
# }

# my $it = filehandle_iterator(*STDIN);
# while (defined(my $line = NEXTVAL($it))) {
#   # do something with $line
# }
### python already does this by default
# open() instead of file(), which exists only on Python 2.
for line in open("foo"):
    # do something with line
    pass  # a comment alone is not a valid loop body


# LASTNAME:FIRSTNAME:CITY:STATE:OWES
# Adler:David:New York:NY:157.00
# Ashton:Elaine:Boston:MA:0.00
# Dominus:Mark:Philadelphia:PA:0.00
# Orwant:Jon:Cambridge:MA:26.30
# Schwern:Michael:New York:NY:149658.23
# Wall:Larry:Mountain View:CA:-372.14


# package FlatDB;
# my $FIELDSEP = qr/:/;
# sub new {
#   my $class = shift;
#   my $file = shift;
#   open my $fh, "<", $file or return;
#   chomp(my $schema = <$fh>);

#   my @field = split $FIELDSEP, $schema;
#   my %fieldnum = map { uc $field[$_] => $_ } (0..$#field);
#   bless { FH => $fh, FIELDS => \@field, FIELDNUM => \%fieldnum,
#           FIELDSEP => $FIELDSEP } => $class;
# }
class FlatDB(object):
    """Read-only view of a flat-file database.

    The first line of the file names the fields; every following line is
    one record.  Fields are separated by FIELDSEP.
    """
    FIELDSEP = ":"

    def __init__(self, _file):
        """Open `_file` and read its header line."""
        self._file = _file
        # open() instead of file(), which exists only on Python 2.
        self.fh = open(self._file)
        self.schema = self.fh.readline().strip()
        self.field = self.schema.split(FlatDB.FIELDSEP)
        # upper-cased field name -> column number, for case-insensitive lookup
        self.fieldnum = dict(zip([x.upper() for x in self.field], range(len(self.field))))

# # usage: $dbh->query(fieldname, value)
# # returns all records for which (fieldname) matches (value)
# use Fcntl ':seek';
# sub query {
#   my $self = shift;
#   my ($field, $value) = @_;
#   my $fieldnum = $self->{FIELDNUM}{uc $field};
#   return unless defined $fieldnum;
#   my $fh = $self->{FH};
#   seek $fh, 0, SEEK_SET;
#   <$fh>;                # discard schema line
#   return Iterator {
#     local $_;
#     while (<$fh>) {
#       chomp;
#       my @fields = split $self->{FIELDSEP}, $_, -1;
#       my $fieldval = $fields[$fieldnum];
#       return $_ if $fieldval eq $value;
#     }
#     return;
#   };
# }

    def query(self, field, value):
        """Generate each record whose `field` column equals `value`.

        `field` is matched case-insensitively; an unknown field produces no
        records.  Records are yielded without surrounding whitespace.  All
        queries read through the one shared file handle, so two queries
        that are advanced alternately disturb each other.
        """
        fieldnum = self.fieldnum.get(field.upper())
        if fieldnum is None:
            return
        fh = self.fh
        fh.seek(0)
        fh.readline() # discard schema line
        for line in fh:
            # Remove the line ending before splitting (the Perl chomps
            # first); otherwise the last column keeps its "\n" and can
            # never equal `value`.
            record = line.strip()
            fields = record.split(FlatDB.FIELDSEP)
            # a short record simply has no value in that column
            if fieldnum < len(fields) and fields[fieldnum] == value:
                yield record



# use FlatDB;
# my $dbh = FlatDB->new('db.txt') or die $!;
# my $q = $dbh->query('STATE', 'NY');
# while (my $rec = NEXTVAL($q)) {
#   print $rec;
# }
# Print every record whose STATE field is NY.
dbh = FlatDB("db.txt")
q = dbh.query("STATE", "NY")
for rec in q:
    print(rec)




# my $q = $dbh->callbackquery(sub { my %F=@_; $F{OWES} > 10 });
# my $q = $dbh->callbackquery(sub { my %F=@_; $F{FIRSTNAME} =~ /^M/ });

# use Fcntl ':seek';
# sub callbackquery {
#   my $self = shift;
#   my $is_interesting = shift;
#   my $fh = $self->{FH};
#   seek $fh, 0, SEEK_SET;
#   <$fh>;                # discard header line
#   return Iterator {
#     local $_;
#     while (<$fh>) {
#       chomp;
#        my %F;
#        my @fieldnames = @{$self->{FIELDS}};
#        my @fields = split $self->{FIELDSEP};
#        for (0 .. $#fieldnames) {
#          $F{$fieldnames[$_]} = $fields[$_];
#        }
#        return $_ if $is_interesting->(%F);
#     }
#     return;
#   }
# }

# Field values arrive as text read from the file, so OWES has to be converted
# before it can be compared with a number.
q = dbh.callbackquery(lambda F: float(F["OWES"]) > 10)
q = dbh.callbackquery(lambda F: F["FIRSTNAME"].startswith("M") )

def callbackquery(self, is_interesting):
    """Generate each record for which is_interesting(F) is true.

    F is a dictionary mapping the field names in self.field to the values
    of the current record.  The schema line is skipped and records are
    yielded without surrounding whitespace.
    """
    fh = self.fh
    fh.seek(0)
    fh.readline() # discard schema line
    for raw in fh:
        record = raw.strip()
        F = dict(zip(self.field, record.split(FlatDB.FIELDSEP)))
        if is_interesting(F):
            yield record


# use FlatDB;
# my $dbh = FlatDB->new('db.txt') or die $!;
# my $q1 = $dbh->query('STATE', 'MA');
# my $q2 = $dbh->query('STATE', 'NY');
# for (1..2) {
#   print NEXTVAL($q1), NEXTVAL($q2);
# }
# Advance two queries on the same database alternately, two records each.
dbh = FlatDB("db.txt")
q1, q2 = dbh.query("STATE","MA"), dbh.query("STATE","NY")
for x in range(1,3):
    print("%s %s" % (next(q1), next(q2)))


# # usage: $dbh->query(fieldname, value)
# # returns all records for which (fieldname) matches (value)
# use Fcntl ':seek';
# sub query {
#   my $self = shift;
#   my ($field, $value) = @_;
#   my $fieldnum = $self->{FIELDNUM}{uc $field};
#   return unless defined $fieldnum;
#   my $fh = $self->{FH};
#   seek $fh, 0, SEEK_SET;
#   <$fh>;                # discard header line
#   my $position = tell $fh;
#   return Iterator {
#     local $_;
#     seek $fh, $position, SEEK_SET;
#     while (<$fh>) {
#       chomp;
#       $position = tell $fh;
#       my @fields = split $self->{FIELDSEP};
#       my $fieldval = $fields[$fieldnum];
#       return $_ if $fieldval eq $value;
#     }
#     return;
#   };
# }
# # callbackquery with bug fix
# use Fcntl ':seek';
# sub callbackquery {
#   my $self = shift;
#   my $is_interesting = shift;
#   my $fh = $self->{FH};
#   seek $fh, 0, SEEK_SET;
#   <$fh>;                # discard header line
#   my $position = tell $fh;
#   return Iterator {
#     local $_;
#     seek $fh, $position, SEEK_SET;
#     while (<$fh>) {
#       $position = tell $fh;
#       my %F;
#       my @fieldnames = @{$self->{FIELDS}};
#       my @fields = split $self->{FIELDSEP};
#       for (0 .. $#fieldnames) {
#         $F{$fieldnames[$_]} = $fields[$_];
#       }
#       return $_ if $is_interesting->(%F);

#     }
#     return;
#   };
# }
# 1;

class FlatDB(object):
    """Read-only view of a flat-file database that supports interleaved queries.

    The first line of the file names the fields; every following line is
    one record.  Fields are separated by FIELDSEP.  Each query remembers its
    own position in the shared file handle and seeks back to it before
    every read, so several queries can be advanced alternately.
    """
    FIELDSEP = ":"

    def __init__(self, _file):
        """Open `_file` and read its header line."""
        self._file = _file
        # open() instead of file(), which exists only on Python 2.
        self.fh = open(self._file)
        self.schema = self.fh.readline().strip()
        self.field = self.schema.split(FlatDB.FIELDSEP)
        # upper-cased field name -> column number, for case-insensitive lookup
        self.fieldnum = dict(zip([x.upper() for x in self.field], range(len(self.field))))

    def _records(self):
        """Generate the data records, without surrounding whitespace."""
        fh = self.fh
        fh.seek(0)
        fh.readline() # discard schema line
        position = fh.tell()
        while 1:
            # Another query may have moved the handle since our last read.
            fh.seek(position)
            line = fh.readline()
            if not line:
                return
            position = fh.tell()
            yield line.strip()

    def query(self, field, value):
        """Generate each record whose `field` column equals `value`.

        `field` is matched case-insensitively; an unknown field produces no
        records.
        """
        fieldnum = self.fieldnum.get(field.upper())
        if fieldnum is None:
            return
        for record in self._records():
            # The record is already stripped, so the last column carries no
            # line ending and can be compared like any other.
            fields = record.split(FlatDB.FIELDSEP)
            # a short record simply has no value in that column
            if fieldnum < len(fields) and fields[fieldnum] == value:
                yield record

    def callbackquery(self, is_interesting):
        """Generate each record for which is_interesting(F) is true.

        F maps the field names from the header to the record's values.
        """
        fieldnames = self.field
        for record in self._records():
            F = dict(zip(fieldnames, record.split(FlatDB.FIELDSEP)))
            if is_interesting(F):
                yield record




# package FlatDB::Iterator;
# my $FIELDSEP = qr/\s+/;
# sub new {
#   my $class = shift;
#   my $it = shift;
#   my @field = @_;
#   my %fieldnum = map { uc $field[$_] => $_ } (0..$#field);
#   bless { FH => $it, FIELDS => \@field, FIELDNUM => \%fieldnum,
class IterFlatDB(object):
    """Flat-file database whose records come from an iterator, not a file.

    The field names are given explicitly because there is no header line.
    Fields within a record are separated by runs of whitespace.
    """
    # Raw string: "\s" in an ordinary literal is an invalid escape sequence.
    FIELDSEP = r"\s+"

    def __init__(self, it, *field):
        """Store the record iterator `it` and the field names.

        The names may be passed as separate arguments or as one list or
        tuple (the form every call in this file uses).
        """
        if len(field) == 1 and isinstance(field[0], (list, tuple)):
            field = tuple(field[0])
        self.it = it
        self.field = field
        # upper-cased field name -> column number, for case-insensitive lookup
        self.fieldnum = dict(zip([x.upper() for x in self.field], range(len(self.field))))



# FlatDB::Iterator->new(
#   $iterator,
#   qw(address rfc931 username datetime tz method page protocol
#      status bytes referrer agent)
# );
# The field names are unpacked into separate arguments, matching the
# constructor's *field parameter (and the flattened qw(...) list in the Perl).
IterFlatDB(iterator,
           *"address rfc931 username datetime tz method page protocol status bytes referrer agent".split())





# # usage: $dbh->query(fieldname, value)
# # returns all records for which (fieldname) matches (value)
# sub query {
#   my $self = shift;
#   my ($field, $value) = @_;
#   my $fieldnum = $self->{FIELDNUM}{uc $field};
#   return unless defined $fieldnum;
#   my $it = $self->{FH};
#   # seek $fh, 0, SEEK_SET;
#   # <$fh>;                # discard header line
#   return Iterator {
#     local $_;
#     while (defined ($_ = NEXTVAL($it))) {
#       my @fields = split $self->{FIELDSEP};
#       my $fieldval = $fields[$fieldnum];
#       return $_ if $fieldval eq $value;
#     }
#     return;
#   };
# }
def query(self, field, value):
    """Generate each record from self.it whose `field` column equals `value`.

    `field` is looked up case-insensitively in self.fieldnum; an unknown
    field produces no records.  Records are yielded unchanged.
    """
    column = self.fieldnum.get(field.upper())
    if column is None:
        return

    for record in self.it:
        if re.split(IterFlatDB.FIELDSEP, record)[column] == value:
            yield record




# my $qit =
#   FlatDB::Iterator->new($it, @FIELDNAMES)->query($field, $value);
# FIELDNAMES is unpacked into separate arguments, matching the constructor's
# *field parameter (and the flattened @FIELDNAMES in the Perl).
qit = IterFlatDB(it, *FIELDNAMES).query(field, value)



# sub readbackwards {
#   my $file = shift;
#   open my($fh), "|-", "tac", $file
#     or return;
#   return Iterator { return scalar(<$fh>) };
# }
def readbackwards(_file):
    """Return a pipe from `tac`, which prints `_file` last line first.

    The result is the file-like object returned by os.popen(); iterating it
    yields the lines in reverse order.  The file name is single-quoted for
    the shell, so names containing spaces or shell metacharacters reach tac
    as one argument, and "--" keeps a leading dash from being read as an
    option.
    """
    quoted = "'" + _file.replace("'", "'\\''") + "'"
    return os.popen("tac -- " + quoted)


# my @fields = qw(address rfc931 username datetime tz method
#                 page protocol status bytes referrer agent);
# my $logfile = readbackwards("/usr/local/apache/logs/access-log")
# my $db = FlatDB::Iterator->new($logfile, @fields);
# my $q = $db->callbackquery(sub {my %F=@_; $F{PAGE}=~ m{/book/$}});
# while (1) {
#   for (1..10) {
#     print NEXTVAL($q);
#   }
#   print "q to quit; CR to continue\n";
#   chomp(my $resp = <STDIN>);
#   last if $resp =~ /q/i;
# }
# Page through the access log, newest entries first, ten matches at a time.
fields = "address rfc931 username datetime tz method page protocol status bytes referrer agent"
logfile = readbackwards("/var/log/apache2/access.log")
# the names are unpacked to match the constructor's *field parameter
db = IterFlatDB(logfile, *fields.split())
# the lambda's parameter is F, so the body must use F as well
q = db.callbackquery(lambda F: re.search("/book/$", F["PAGE"]))

while 1:
    for line in itertools.islice(q, 10):
        print(line)
    print("q to quit; CR to continue")
    if raw_input() == 'q':
        break
        





# my $seed = 1;
# sub Rand {
#   $seed = (27*$seed+11111) & 0x7fff;
#   return $seed;
# }
# State of the generator, shared by Rand() and anything else that sets `seed`.
seed = 1
def Rand():
    """Advance the module-level `seed` one step and return the new value.

    The result is always in the range 0..0x7fff.
    """
    global seed
    advanced = (11111 + 27 * seed) & 0x7fff
    seed = advanced
    return advanced


# sub SRand {
#   $seed = shift;
# }
def SRand(_seed):
    """Set the module-level `seed` to `_seed`."""
    globals()["seed"] = _seed


# SRand($$);
# Seed the home-grown generator with the id of the current process.
SRand(os.getpid())


# use CGI::Push;
# my $seed = shift || $$ ;
# srand($seed);
# open LOG, "> $logfile" or die ... ;
# print LOG "Random seed: $seed\n";
# do_push(...);
# Python has no built-in srand(); random.seed() is the standard-library
# counterpart of Perl's srand.
import random

# Use the seed given on the command line if there is one, otherwise the
# process id, and write it to the log so that a run can be repeated.
if len(sys.argv) > 1:
    seed = int(sys.argv[1])
else:
    seed = os.getpid()
random.seed(seed)
LOG = open(logfile, "w")
LOG.write("Random seed: %s\n" % seed)
do_push(...)


# use Foo;
# while (<>) {
#   my $random = Rand();
#   # do something with $random
#   foo();
# }
# Draw one number from the shared generator for every line of input.
import Foo
for line in sys.stdin:
    # NOTE(review): this rebinds the name `random`; if the standard module of
    # that name has been imported in this scope, it becomes unreachable.
    random = Rand()
    # do something with random
    Foo.foo()




# sub make_rand {
#   my $seed = shift || (time & 0x7fff);
#   return Iterator {
#     $seed = (29*$seed+11111) & 0x7fff;
#     return $seed;
#   }
# }
def make_rand(seed=None):
    """Generate an endless stream of numbers in the range 0..0x7fff.

    Each generator keeps its own state.  Without a seed, the low 15 bits of
    the current time are used as the starting point.
    """
    state = seed
    if state is None:
        state = int(time.time()) & 0x7fff
    while True:
        state = (11111 + 29 * state) & 0x7fff
        yield state



# use Foo;
# my $rng = make_rand();
# while (<>) {
#   my $random = NEXTVAL($rng);
#   # do something with $random
#   foo();
# }
# Same loop as before, but the numbers come from a private generator object
# instead of the module-level seed.
import Foo
rng = make_rand()
for line in sys.stdin:
    random = rng.next()
    # do something with random
    Foo.foo()



### 4.4 Filters and Transforms

# sub imap {
#   my ($transform, $it) = @_;
#   return Iterator {
#     my $next = NEXTVAL($it);
#     return unless defined $next;
#     return $transform->($next);
#   }
# }

# itertools.imap does this already
def imap(transform, it):
    """Generate transform(x) for every x produced by `it`, lazily."""
    for item in it:
        result = transform(item)
        yield result


# my $rng = imap(sub { $_[0] / 37268 }, make_rand());
# Scale every number from make_rand() down to a float.
# NOTE(review): make_rand() masks its values with 0x7fff, so dividing by 32768
# would map them into [0, 1); 37268 (also in the Perl above) looks like a
# transposed digit -- confirm against the book before changing it.
rng = imap(lambda x: float(x)/37268, make_rand())


# sub imap (&$) {
#   my ($transform, $it) = @_;
#   return Iterator {
#     my $next = NEXTVAL($it);
#     return unless defined $next;
#     return $transform->($next);
#   }
# }

# my $rng = imap { $_[0] / 37268 } make_rand();

# sub imap (&$) {
#   my ($transform, $it) = @_;
#   return Iterator {
#      local $_ = NEXTVAL($it);
#      return unless defined $_;
#      return $transform->();
#   }
# }

# these are irrelevant changes for python



# sub igrep (&$) {
#   my ($is_interesting, $it) = @_;
#   return Iterator {
#     local $_;
#     while (defined ($_ = NEXTVAL($it))) {
#       return $_ if $is_interesting->();
#     }
#     return;
#   }
# }
def igrep(is_interesting, it):
    """Generate only those values from `it` that is_interesting() accepts."""
    for candidate in it:
        if not is_interesting(candidate):
            continue
        yield candidate


# # instead of         my $next_octopus =
# #    interesting_files(\&contains_octopuses, 'uploads', 'downloads' ;
#                                                                    )
# my $next_octopus = igrep { contains_octopuses($_) }
#                        dir_walk('uploads', 'downloads');
# while ($file = NEXTVAL($next_octopus)) {
#   # do something with the file
# }
for _file in igrep(contains_octopuses, dir_walk("uploads", "downloads")):
    # do something with the file
    pass  # a comment alone is not a valid loop body



# sub list_iterator {
#   my @items = @_;
#   return Iterator {
#     return shift @items;
#   };
# }
def list_iterator(*args):
    """Generate the given arguments one at a time, in order."""
    position = 0
    while position < len(args):
        yield args[position]
        position += 1
# or just let the built-in iter() do the work
def list_iterator(*args):
    """Return an iterator over the given arguments."""
    return iter(args)



# sub append {
#   my @its = @_;
#   return Iterator {
#     while (@its) {
#     my $val = NEXTVAL($its[0]);
#     return $val if defined $val;
#     shift @its;  # Discard exhausted iterator
#   }
#   return;
# };
def append(its):
    """Generate all values of the first iterator in `its`, then the second, and so on."""
    for source in its:
        cursor = iter(source)
        while True:
            try:
                value = next(cursor)
            except StopIteration:
                break   # this one is exhausted; move on to the next
            yield value
# or just let itertools.chain() do the work
def append(its):
    """Return an iterator over the values of every iterator in `its`, in order."""
    return itertools.chain(*its)



### 4.5 The Semipredicate Problem

# This whole section is irrelevant because of the way
# Python handles iterators/generators,
# so I skipped it.  If someone sees something
# in here that deserves a Python translation,
# let me know.

### 4.6 Alternative Interfaces to Iterators


# sub equal_arrays (\@\@) {
#   my ($x, $y) = @_;
#   return unless @$x == @$y;     # arrays are the same length?
#   for my $i (0 .. $#$x) {
#     return unless $x->[$i] eq $y->[$i];   # mismatched elements
#   }
#   return 1;                     # arrays are equal
# }
def equal_arrays(x,y):
    """Return True if x and y have the same length and equal elements, else False."""
    if len(x) != len(y):
        return False
    return not any(x[i] != y[i] for i in range(len(x)))
# but this is unnecessary since lists already compare element by element
def equal_arrays(x,y):
    """Return True if the two lists are equal."""
    return x == y



# sub equal_arrays (\@\@) {
#   my ($x, $y) = @_
#   return unless @$x == @$y;
#   my $xy = each_array(@_ );
#   while (my ($xe, $ye) = NEXTVAL($xy)) {
#     return unless $xe eq $ye;
#   }
#   return 1;
# }
def equal_arrays(x,y):
    """Return True if x and y have the same length and equal elements, else False.

    The elements are compared pairwise as produced by each_array(x, y).
    """
    if len(x) == len(y):
        for pair in each_array(x, y):
            left, right = pair
            if left != right:
                return False
        return True
    return False
    

# sub each_array {
#   my @arrays = @_;
#   my $cur_elt = 0;
#   my $max_size = 0;
#   # Get the length of the longest input array
#   for (@arrays) {
#     $max_size = @$_ if @$_ > $max_size;
#   }
#   return Iterator {
#     $cur_elt = 0, return () if $cur_elt >= $max_size;
#     my $i = $cur_elt++;
#     return map $_->[$i], @arrays;
#   };
# }
def each_array(*arrays):
    """Walk several lists in parallel.

    Generates one list per position, holding the element each input list
    has there.  Iteration runs to the length of the longest list; lists
    that are shorter contribute None.  With no lists at all, nothing is
    generated.
    """
    # max() over a list of sizes works for a single array too;
    # max(*sizes) would be called as max(3) and fail.
    sizes = [len(ar) for ar in arrays]
    max_size = max(sizes) if sizes else 0

    def get_item(ar, i):
        if i < len(ar):
            return ar[i]
        return None

    for i in range(max_size):
        yield [get_item(ar, i) for ar in arrays]
# you could also probably do something clever with itertools.izip() 



# my $buttons = each_array(\@labels, \@values);
# ...
# while (my ($label, $value) = NEXTVAL($buttons)) {
#   print HTML qq{ $label<br>\n};
# }
# NOTE(review): the page this was taken from rendered the HTML inside the
# string, which split the literal across two lines.  The line break is
# restored as <br>; the markup in front of the label (presumably the button
# element that uses `value`) is lost -- confirm against the book.
buttons = each_array(labels, values)
for label, value in buttons:
    HTML.write(" %(label)s<br>\n" % locals())



# sub each_array {
#   my @arrays    = @_;
#   my $stop_type = ref $arrays[0] ? 'maximum' : shift @arrays;
#   my $stop_size = @{$arrays[0]};
#   my $cur_elt   = 0;
#   # Get the length of the longest (or shortest) input array
#   if ($stop_type eq 'maximum') {
#     for (@arrays) {
#       $stop_size = @$_ if @$_ > $stop_size;
#     }
#   } elsif ($stop_type eq 'minimum') {
#     for (@arrays) {
#       $stop_size = @$_ if @$_ < $stop_size;
#     }
#   } else {
#     croak "each_array: unknown stopping behavior '$stop_type'";
#   }
#   return Iterator {
#     return ()  if $cur_elt >= $stop_size;
#     my $i = $cur_elt++;
#     return map $_->[$i], @arrays;
#   };
# }
def each_array(arrays, stop_type="maximum"):
    """Iterate over a list of arrays in parallel.

    Yields one list per index position, holding the element found at that
    position in each array.

    arrays    -- a list of indexable sequences (an empty list yields
                 nothing).
    stop_type -- "maximum" (default) runs to the length of the longest
                 array, padding shorter ones with None; "minimum" stops
                 at the length of the shortest array.

    An unknown stop_type trips the assertion below.
    """
    assert stop_type in ("minimum", "maximum"), \
        "each_array: unknown stopping behavior '%s'" % stop_type

    # Keep the lengths in a list and pass that list to min()/max().
    # Unpacking it (min(*lengths)) breaks when there is exactly one array,
    # because min(3) is not iterable, and when there are none.
    lengths = [len(ar) for ar in arrays]
    if not lengths:
        stop_size = 0
    elif stop_type == "minimum":
        stop_size = min(lengths)
    else:
        stop_size = max(lengths)

    def get_item(ar, i):
        # Only reachable past the end of an array in "maximum" mode.
        if i < len(ar):
            return ar[i]
        return None

    for i in range(stop_size):
        yield [get_item(ar, i) for ar in arrays]



# sub eachlike (&$) {
#   my ($transform, $it) = @_;
#   return Iterator {
#     local $_ = NEXTVAL($it);
#     return unless defined $_;
#     my $value = $transform->();
#     return wantarray ? ($_, $value) : $value;
#   }
# }
# not sure if wantarray really maps to python
# style



# package CIA;
# sub TIESCALAR {
#   my $package = shift;
#   my $self = {};
#   bless $self => $package;
# }
# sub STORE { }
# sub FETCH { "<>" }

# tie $secret, 'CIA';

# $secret = 'atomic ray';

# print "The secret weapon is '$secret'.\n"

# the secret weapon is '<>'.

# I can't think of any reasonable way to do
# this in python.  In part it seems like something
# you could handle with descriptor and in part
# with "with".  I'm just going to ignore TIE-ing
# for now


### 4.7 An Extended Example: Web Spiders


# use HTML::LinkExtor;
# use LWP::Simple;
# sub traverse {
#   my @queue = @_;
#   my %seen;
#   return Iterator {
#     while (@queue) {
#       my $url = shift @queue;
#       $url =~ s/#.*$//;
#       next if $seen{$url}++;
#       my ($content_type) = head($url);
#       if ($content_type =~ m{^text/html\b}) {
#         my $html = get($url);
#         push @queue, get_links($url, $html);
#       }
#       return $url;
#     }
#     return;     # exhausted
#   }
# }
import urllib2    
def traverse(_queue):
    """Generate the URLs reachable from a list of starting URLs.

    Each URL is taken from the front of the queue, stripped of its
    "#fragment", and fetched once.  If the response is text/html its links
    (as returned by get_links()) are appended to the queue.  Every URL that
    could be opened is yielded; URLs that fail to open are reported on
    stdout and skipped.

    _queue -- list of starting URLs; it is copied, not modified.
    """
    queue = _queue[:]
    seen = set()
    while queue:
        url = queue.pop(0)
        url = url.split("#")[0]
        if url in seen:
            continue
        seen.add(url)
        try:
            page = urllib2.urlopen(url)
        except urllib2.URLError:
            # URLError is the base class of HTTPError, so this also covers
            # failures that never reach the HTTP layer (bad host, refused
            # connection) instead of letting them abort the whole crawl.
            print "http error for:", url
            continue
        try:
            # getheader() returns None when the server sends no
            # Content-Type; re.search() would raise TypeError on that.
            content_type = page.headers.getheader("content-type") or ""
            if re.search(r"^text/html\b", content_type):
                html = page.read()
                queue.extend(get_links(url, html))
        finally:
            page.close()
        yield url
            

# sub get_links {
#   my ($base, $html) = @_;
#   my @links;
#   my $more_links = sub {
#     my ($tag, %attrs) = @_;
#     push @links, values %attrs;
#   };
#   HTML::LinkExtor->new($more_links, $base)->parse($html);
#   return @links;
# }
# Off the top of my head I don't know a python library
# that provides this exact functionality, so we 
# fake it.
def get_links(base, html):
    """Return the absolute http(s) URLs of the <a href="..."> links in html.

    base -- URL of the page the HTML came from; relative links are
            resolved against it.
    html -- the page source.

    Anchors without an href are ignored.  Links that do not resolve to an
    http or https URL (mailto:, javascript:, ...) are dropped.
    """
    links = []

    for anchor in BeautifulSoup.BeautifulSoup(html)('a'):
        href = anchor.get("href")
        if not href:
            continue

        # urljoin() implements proper relative-reference resolution.  Gluing
        # the link onto the base path by hand turns "/a/b.html" + "c.html"
        # into "/a/b.htmlc.html" and mishandles "../", "?query" and
        # relative names that merely begin with "http".
        link = urlparse.urljoin(base, href)

        if urlparse.urlparse(link)[0] in ("http", "https"):
            links.append(link)

    return links



# # Version with 'interesting links' callback
# sub traverse {
#   my $interesting_links = sub { @_ };
#   $interesting_links = shift if ref $_[0] eq 'CODE';
#   ...
#         push @queue, $interesting_links->(get_links($url, $html));
#   ...
# }
def traverse(queue, interesting_links=None):
    # Elided sketch, mirroring the Perl fragment above: the "..." lines stand
    # for the parts of the function that are not shown, and only the changed
    # statement is spelled out.
    # NOTE(review): no fallback for interesting_links=None is shown in this
    # fragment -- presumably it belongs to the elided part; confirm.
    ...
    # Pass the freshly extracted links through the caller's callback and
    # queue whatever it returns.
    queue.extend(interesting_links(get_links(url, html)))
    ...



# my $top = 'http://perl.plover.com/';
# my $interesting = sub { grep /^\Q$top/o, @_ };
# my $urls = traverse($interesting, $top);
top = "http://perl.plover.com"
# The callback receives the list of links found on a page and must return
# the ones worth following; like the Perl grep /^\Q$top/ it keeps those that
# start with `top`.
interesting = lambda links: [link for link in links if link.startswith(top)]
# traverse() takes the list of start URLs first and the callback second.
urls = traverse([top], interesting)



# use File::Basename;
# while (my $url = NEXTVAL($urls)) {
#   my $file = $url;
#   $file =~ s/^\Q$top//o;
#   my $dir = dirname($file);
#   system('mkdir', '-p', $dir) == 0 or next;
#   open F, ">", $file or next;
#   print F get($url);
# }
# Mirror every URL produced by the traversal into a local file whose path
# is the URL with the site prefix removed.
for url in urls:
    # Strip the leading `top` only (url.replace(url, "") always produced an
    # empty name), then drop leading slashes so the path stays relative to
    # the current directory.
    if url.startswith(top):
        _file = url[len(top):]
    else:
        _file = url
    _file = _file.lstrip("/")
    _dir = os.path.dirname(_file)
    try:
        # os.makedirs() replaces the interpolated "mkdir -p" shell command,
        # so odd characters in a URL can never reach a shell.
        if _dir and not os.path.isdir(_dir):
            os.makedirs(_dir)
        F = open(_file, "w")
    except (IOError, OSError):
        # Same as the Perl "or next": skip URLs we cannot store.
        continue
    try:
        F.write(urllib2.urlopen(url).read())
    finally:
        F.close()




# while (my $url = NEXTVAL($urls)) {
#   print "Bad link to: $url" unless head($url);
# }
for url in urls:
    try:
        urllib2.urlopen(url)
    except:
        print "Bad link to: %s" % url



# sub traverse {
#       ...
#       my (%head, $html);
#       @head{qw(TYPE LENGTH LAST_MODIFIED EXPIRES SERVER)} = head($url);
#       if ($head{TYPE} =~ m{^text/html\b}) {
#         $html = get($url);
#         push @queue, $interesting_links->(get_links($url,$html));
#       }
#       return wantarray ? ($url, \%head, $html) : $url;
#       ...
# }
# I don't think there is a straightforward way to duplicate
# "wantarray" type functionality in python.  In any case
# it would be more uniform to *always* return the tuple


# sub traverse {
#   my $interesting_links = sub { shift; @_ };
#   $interesting_links = shift if ref $_[0] eq 'CODE';
#   my @queue = map [$_, 'supplied by user'], @_;
#   my %seen;
#   return Iterator {
#     while (@queue) {
#       my ($url, $referrer) = @{shift @queue};
#       $url =~ s/#.*$//;
#       next if $seen{$url}++;
#       my (%head, $html);
#       @head{qw(TYPE LENGTH LAST_MODIFIED EXPIRES SERVER)} = head($url);
#       if ($head{TYPE} =~ m{^text/html\b}) {
#         my $html = get($url);
#         push @queue,
#           map [$_, $url],
#             $interesting_links->($url, get_links($url, $html));
#       }
#       return wantarray ? ($url, \%head, $referrer, $html) : $url;
#     }
#     return;     #exhausted
#   }
# }
import urllib2    
def traverse(queue, interesting_links=None):
    """Generate (url, headers, referrer, html) for every reachable URL.

    queue             -- list of starting URLs; each is given the referrer
                         "supplied by user".
    interesting_links -- optional callback called as
                         interesting_links(page_url, links); it returns the
                         links worth following.  By default every link is
                         followed.

    For a URL that cannot be opened the tuple is (url, None, referrer,
    None).  For a page that is not text/html, html is None.
    """
    queue = [(x, "supplied by user") for x in queue]

    if interesting_links is None:
        def interesting_links(this_url, other_urls):
            return other_urls

    seen = set()

    while queue:
        url, referrer = queue.pop(0)
        url = url.split("#")[0]
        if url in seen:
            continue
        seen.add(url)
        try:
            page = urllib2.urlopen(url)
        except urllib2.URLError:
            # URLError is the base class of HTTPError, so unreachable hosts
            # are reported as bad links too instead of aborting the crawl.
            print "http error for:", url
            yield url, None, referrer, None
            continue
        # Reset for every page: without this a non-HTML page either raised
        # UnboundLocalError or re-yielded the previous page's HTML.
        html = None
        # getheader() returns None when there is no Content-Type header.
        content_type = page.headers.getheader("content-type") or ""
        if re.search(r"^text/html\b", content_type):
            html = page.read()
            queue.extend([(x, url) for x in interesting_links(url, get_links(url, html))])
        yield url, page.headers, referrer, html
            


# my $top = 'http://perl.plover.com/';
# my $interesting = sub { shift; grep /^\Q$top/o, @_ };
# my $urls = traverse($interesting, $top);
# while (my ($url, $head, $referrer) = NEXTVAL($urls)) {
#   next if $head->{TYPE};
#   print "Page '$referrer' has a bad link to '$url'\n";
# }
top = "http://perl.plover.com"
interesting = (lambda x,y: [_y for _y in y if top in _y])
urls = traverse([top], interesting)
for url, head, referrer, html in urls:
    if not html:
        continue
    print "Page '%s' has a bad link to '%s'" % (referrer, url)



# my $top = 'http://perl.plover.com/';
# my $interesting = sub { shift; grep /^\Q$top/o, @_ };
# my $urls = igrep_l { not $_[1]{TYPE} } traverse($interesting, $top);
# while (my ($url, $head, $referrer) = NEXTVAL($urls)) {
#   print "Page '$referrer' has a bad link to '$url'\n";
# }
top = "http://perl.plover.com"
interesting = (lambda x,y: [_y for _y in y if top in _y])
urls = igrep_l((lambda url, head, referrer, html: not html), traverse([top], interesting))
for url, head, referrer, html in urls:
    if not html:
        continue
    print "Page '%s' has a bad link to '%s'" % (referrer, url)




# sub igrep_l (&$) {
#   my ($is_interesting, $it) = @_;
#   return Iterator {
#     while (my @vals = NEXTVAL($it)) {
#       return @vals if $is_interesting->(@vals);
#     }
#     return;
#   }
# }
def igrep_l(is_interesting, it):
    """Filter an iterator of tuples with a multi-argument predicate.

    Each item drawn from `it` is unpacked into the arguments of
    is_interesting(); the item itself is yielded, unchanged, whenever the
    predicate returns a true value.
    """
    for candidate in it:
        if not is_interesting(*candidate):
            continue
        yield candidate
        

# while (my ($url, $head, $referrer) = NEXTVAL($urls)) {
#   print "Page '$referrer' has a bad link to '$url'\n";
#   print "Edit now? ";
#   my $resp = <>;
#   if ($resp =~ /^y/i) {
#     system $ENV{EDITOR}, url_to_filename($referrer);
#   } elsif ($resp =~ /^q/i) {
#     last;
#   }
# }
for url, head, referrer, html in urls:
    print "Page '%(referrer)s' has a bad line to '%(url)s'" % locals()
    print "Edit now?"
    resp = raw_input():
    if resp == 'y':
        os.system(os.environ["EDITOR"] + " " + url_to_filename(referrer))
    elif resp == 'q':
        break





# sub traverse {
#   my $interesting_link;
#   $interesting_link = shift if ref $_[0] eq 'CODE';
#   my @queue = map [$_, 'supplied by user'], @_;
#   my %seen;
#   my $q_it = igrep { ! $seen{$_->[0]}++ }
#                imap { $_->[0] =~ s/#.*$//; $_}
#                  Iterator { return shift(@queue) };
#   if ($interesting_link) {
#     $q_it = igrep {$interesting_link->(@$_)} $q_it;
#   }
#   return imap {
#       my ($url, $referrer) = @$_;
#       my (%head, $html);
#       @head{qw(TYPE LENGTH LAST_MODIFIED EXPIRES SERVER)} = head($url);
#       if ($head{TYPE} =~ m{^text/html\b}) {
#         $html = get($url);
#         push @queue,
#           map [$_, $url],
#             get_links($url, $html);
#       }
#       return wantarray ? ($url, \%head, $referrer, $html) : $url;
#   } $q_it;
# }
# this is not an exact match but is close enough for our
# purposes, whatever those purposes may be.
def traverse(queue, interesting_link=None):
    seen = {}
    queue = [(x, "supplied by user") for x in queue]

    def iterate_queue():
        while queue:
            yield queue.pop(0)

    def not_seen_yet(url):
        seen.setdefault(url,0)
        seen[url] += 1
        if seen[url] > 1:
            return False
        return True

    q_it = iterate_queue()
    q_it = ((url[0].split("#")[0], url[1]) for url in q_it)
    q_it = (url for url in q_it if not_seen_yet(url[0]))

    if interesting_link != None:
        q_it = igrep(interesting_link, q_it)

    def process_url((url, referrer)): 
        print "process_url:", url, referrer
        try:
            page = urllib2.urlopen(url)
        except urllib2.HTTPError:
            print "http error for:", url
            return url, None, referrer, None

        content_type = page.headers.getheader("content-type")
        if re.search(r"^text/html\b", content_type):
            html = page.read()
            queue.extend([(x, url) for x in get_links(url, html)])
        return url, page.headers, referrer, html
            
    return imap(process_url, q_it)
        


# sub make_robot_filter {
#   my $agent = shift;
#   my %seen_site;
#   my $rules = WWW::RobotRules->new($agent);
#   return sub {
#     my $url = url(shift());
#     return 1 unless $url->scheme eq 'http';
#     unless ($seen_site{$url->netloc}++) {
#       my $robots = $url->clone;
#       $robots->path('/robots.txt');
#       $robots->frag(undef);
#       $rules->parse($robots, get($robots));
#     }
#     $rules->allowed($url)
#   };
# }
def make_robot_filter(agent):
    """Build a predicate that checks URLs against each site's robots.txt.

    agent -- the user-agent name passed to can_fetch().

    The returned function takes a URL and returns True for any URL whose
    scheme is not "http"; for http URLs it returns what the site's
    robots.txt parser answers for this agent.  One parser is created, and
    its robots.txt read, the first time each network location is seen.
    """
    parsers_by_site = {}

    def _filter(url):
        parts = urlparse.urlparse(url)
        if parts.scheme == "http":
            site = parts.netloc
            parser = parsers_by_site.get(site)
            if parser is None:
                parser = robotparser.RobotFileParser()
                # Register the parser before fetching, so a site is only
                # ever set up once.
                parsers_by_site[site] = parser
                parser.set_url(parts.scheme+"://"+site+"/robots.txt")
                parser.read()
            return parser.can_fetch(agent, url)

        # Only http URLs are subject to robots.txt here.
        return True

    return _filter
lair of the dustbunny

Wednesday, February 25, 2009

Higher Order Perl (Python Style) : Chapter 4 - Iterators

No comments:

Blog Archive

About Me

Labels