use strict; use LWP::UserAgent; my ($local,$s,@l,%tags,%temp,@temp,$sub, $ua, @status,$file,$outfile); $file='c:/perl/xml/config.xml' unless $file=shift @ARGV; my $lurl; #local url useful for testing my $debug=1; # 2 gets hash names my $headers=0; # debug option to print out the headers my @dontwant; # items in here are not processed my $useragent="BangkokPerl/1.0"; my $debuglog="debug.log"; open LOG,">$debuglog" if $debug; { my @page=parseconfig($file,"page"); my $page=$page[0]; $_=$page->{'options'}; #set options if (/debug=(\d)/) { $debug=$1; } print "debug $debug\n"; if (/lurl/) { $lurl=1; } if (/headers/) { $headers=1; } $_=$page->{'dontwant'}; if ($_) {@dontwant=split;} @temp=parseconfig($file,'item'); #get the items my $outfile = $page->{'outfile'}; if ($outfile) { open OUT, ">$outfile" or die "died opening $file"; select OUT; } $page->{'head'}=~ s/^$/$1/s; #clean off any comments print eval($page->{'head'}),"\n"; #print the header $page->{'body'}=~ s/^$/$1/s; #start off the body print eval($page->{'body'}),"\n"; foreach (@temp) { getdata($_); } #process each $page->{'footer'}=~ s/^$/$1/s; print eval($page->{'footer'}),"\n"; #print footer exit; } sub getdata { my ($k,$v,@l,%temp); my $temp=shift or die "no data"; foreach (@dontwant) { return if $_ eq $temp->{'name'}; } if ($lurl) {$temp->{'fetch'}=$temp->{'lurl'}; } else {$temp->{'fetch'}=$temp->{'url'}; } $temp->{'sub'}=~ s/^$/$1/s; #clean off any comments if (!$temp->{'method'} or$temp->{'method'} eq 'chunk') { print retrievechunk($temp),"\n"; } elsif ($temp->{'method'} eq 'all') { print retrieveall($temp),"\n"; } elsif($temp->{'method'}=~ m/file/) { $temp->{'file'}=$temp->{'method'}; $temp->{'file'}=~ s/^file=(.+)/$1/; print retrievetofile($temp),"\n"; } 0; } sub retrievetofile { my $temp=shift or die "no data"; my $url=$temp->{'fetch'}; my $sub=$temp->{'sub'}; my $file=$temp->{'file'}; print STDERR "$temp->{'name'} $temp->{'method'}\n" if $debug; $ua = new LWP::UserAgent if not $ua; $ua->agent($useragent); $ua->env_proxy; #let environment set proxy $s=""; my $req = new HTTP::Request 'GET' => $url; my $res = $ua->request($req,$file); print LOG "$temp->{'name'} Status: " . $res->status_line . "\n"; print STDERR "$temp->{'name'} $res\n"; if ($res->is_success) { $_=$res->content; return eval($sub); } else { print STDERR "Error: $temp->{'name'}" . $res->status_line . " $url\n"; return "Error: $temp->{'name'}" . $res->status_line . " $url\n"; } } sub retrieveall { # good for short ones my $temp=shift or die "no data"; my $url=$temp->{'fetch'}; my $sub=$temp->{'sub'}; print STDERR "$temp->{'name'} $temp->{'method'} $url\n" if $debug; $ua = new LWP::UserAgent if not $ua; $ua->agent($useragent); # $ua->env_proxy; #let environment set proxy $s=""; my $res=$ua->request(HTTP::Request->new('GET', $url)); if ($headers) { showheaders($temp,$res) } else { print LOG "$temp->{'name'} Status: " . $res->status_line . "\n"; } if ($res->is_success) { $_=$res->content; return eval($sub); } else { print STDERR "Error: $temp->{'name'}" . $res->status_line . " $url\n"; return "Error: $temp->{'name'}" . $res->status_line . " $url\n"; } } sub retrievechunk { #default, good if don't need whole thing my $temp=shift or die "no data"; my $url=$temp->{'fetch'}; my $sub=$temp->{'sub'}; my $head=$headers; my $method="CHUNK"; print STDERR "$temp->{'name'} $method $url\n" if $debug; $ua = new LWP::UserAgent if not $ua; $ua->agent($useragent); # $ua->env_proxy; #let environment set proxy my ($expected_length, $bytes_received); my $lastchunk=""; $s="no data"; my $res=$ua->request(HTTP::Request->new('GET', $url), sub { my($chunk, $res) = @_; if ($head) { showheaders($temp,$res); $head=0;} else { print LOG "$temp->{'name'} Status: " . $res->status_line . "\n"; } $bytes_received += length($chunk); unless (defined $expected_length) { $expected_length = $res->content_length || 0; } if ($expected_length) { printf STDERR "%d%% - ",100 * $bytes_received / $expected_length if $debug==2; } print STDERR "$bytes_received bytes received\n" if $debug; $_=$lastchunk=$lastchunk . $chunk; #our data can get chopped! die if ($s=eval($sub)); # $s; },1024); if (!$res->is_success) { print STDERR "$temp->{'name'} Error: " . $res->status_line . " $url\n"; $s= "$temp->{'name'} Error: " . $res->status_line . " $url\n"; } return $s; } sub showheaders { #show the headers my ($temp,$res) =@_; print LOG "########$temp->{'name'} $temp->{'method'}########\n"; my $hd=$res->headers; while (my ($k,$v) = each %$hd) { print LOG "$k\t$v\n"; if ( $k eq 'content_type' and $v=~/ARRAY/) { foreach ($v) { print $_,"\n"} } } } sub parseconfig { my ($s,$k,$v,@l,%tags,%temp,@temp,$element); my $file = shift or die "no filename for parseconfig"; # we need a filename # parse an XML config file - assumes a DTD like this # $element="item" if not $element=shift; # change name if you want # we return a list of anonymous hashes, each with the above keys (without the ?s) # can add additional fields to the DTD, parser doesn't care # note this won't handle nested items { local $/= undef; open CONFIG,"$file"; $s= ; close CONFIG; } if ($s=~ m/\\s*(.*?)\\s*(.*)"; } } else { die "bad DTD $file"; } $_=$s; @l=m#<$element>(.+?)#sg; #get a list of elements foreach $s (@l) { my $temp={}; while (($k,$v) = each %tags) { $s=~ m/$v/s; $temp->{$k}=$2 if length $2; } push @temp,$temp; #anonymous hash to list } return @temp if ($debug < 3); print "-----keys------\n"; my $temp={}; for $temp (@temp) { while (($k,$v) = each %$temp) { print "$k\t$v\n"; } } exit; }