# All pastes #1505604 Raw Edit

# Wikipedia part parser

# public text v1 · immutable
# #1505604 · published 2009-07-24 10:18 UTC
# rendered paste body
# copyright 2009 h4ck3rm1k3@flossk.org
# licensed under GNU Affero General Public License
# http://www.fsf.org/licensing/licenses/agpl-3.0.html
use LWP::UserAgent;
use Compress::Bzip2 qw(:all );
use strict;
use warnings;


# Command line: [part-number [dump-url]].
# The part number falls back to 0 when no argument is given.
my $partno = shift @ARGV;
$partno = 0 if !defined $partno;

# A missing (or false) second argument selects the default dump location.
my $default_url = 'http://download.wikimedia.org/enwiki/20090713/enwiki-20090713-pages-articles.xml.bz2';
my $url = shift(@ARGV) || $default_url;

# Local file that receives this part of the dump.
my $filename = sprintf "wikipedia_dump_part_%0.4d_.bz2", $partno;
warn "getting $filename";

# Identity helper: hands back its first argument untouched
# (undef when called without arguments).
sub c
{
    my ($value) = @_;
    return $value;
}

# Decompress a bzip2-compressed byte string in a single call.
#
#   $data   - the compressed bytes
#   returns - the decompressed bytes, or undef when the decompression
#             stream reports an error for the input
sub bunzip {
    my ($data) = @_;

    my $stream = Compress::Bzip2::decompress_init();

    # add() returns whatever output is available so far; per the
    # Compress::Bzip2 documentation it returns undef on error.
    my $plain = $stream->add($data);
    return $plain unless defined $plain;

    # finish() returns the output still buffered inside the stream.
    # Dropping it could truncate the result, so it is appended here.
    my $rest = $stream->finish;
    $plain .= $rest if defined $rest;

    return $plain;
}

# Every byte received so far; the download callback appends to it.
my $seen = "";
# Number of bytes of the remote file that make up one part.
my $blocksize = 1500000;
# Reported below for information only.
my $chunksize = 150000;

warn " chunksize $chunksize \n";

# Text of the most recent warning seen by the __WARN__ hook.
our $warn = "";

# Warning hook: print an "error" marker, echo the warning, remember it.
# Perl disables the __WARN__ hook while it is running, so the warn()
# calls in here go straight to STDERR instead of recursing.
$SIG{'__WARN__'} =
    sub {
        warn "error";
        warn $_[0];
        $warn = $_[0];
    };

# Die hook: log the fatal message before the program goes down.
$SIG{'__DIE__'} =
    sub {
        # __DIE__ is also invoked for exceptions raised inside eval, which
        # code uses to trap recoverable errors.  $^S is true in that case;
        # stay quiet so trapped exceptions are not reported as fatal and
        # do not overwrite $warn.
        return if $^S;
        warn "error2";
        warn $_[0];
    };

# Run bzip2recover over the downloaded part ($filename), echo everything
# the tool prints to STDOUT, and emit warnings describing each bzip2 block
# it reports.  Takes no arguments and returns nothing useful.  A tool that
# cannot be started, or that exits with a failure status, produces a
# warning rather than aborting the script.
sub checkbz2
{
    # "2>&1" folds the tool's stderr into the pipe so that all of its
    # messages can be read here; that redirection needs the shell, hence
    # the single command string.
    my $command = "bzip2recover $filename 2>&1";

    my $bz;
    unless (open $bz, '-|', $command)
    {
        warn "cannot run '$command': $!\n";
        return;
    }

    # A lexical loop variable keeps the caller's $_ intact.
    while (my $line = <$bz>)
    {
        if ($line =~ /block (\d+) runs from (\d+) to (\d+)/)
        {
            my ($block, $from, $to) = ($1, $2, $3);
            my $size  = $to - $from;
            my $fromb = $from / 8; # bytes
            warn "Found $line";
            warn "Block $block size $size";
            warn "Block $block from $fromb";
        }
        print $line;
    }

    # close() on a pipe is false when the command exited non-zero
    # (then $! is 0 and $? holds the wait status).
    unless (close $bz)
    {
        warn $!
            ? "error closing pipe from bzip2recover: $!\n"
            : "bzip2recover exited with status " . ($? >> 8) . "\n";
    }

    return;
}

# Content handler passed to $ua->request: called with each chunk of the
# response body as it arrives.
#
#   $data - the chunk just received (the response and protocol objects
#           that are passed as well are not needed here)
#
# Writes the chunk to the OUT filehandle, appends it to $seen, and returns
# the total number of bytes collected in $seen so far.  Dies when the
# write fails; LWP documents that a die inside the callback aborts the
# request and records the message in the "X-Died" response header.
sub callback
{
    my ($data) = @_;

    print OUT $data
        or die "cannot write downloaded data to output file: $!\n";

    $seen .= $data;
    return length $seen;
}

# Main flow: ask the server how large the dump is, work out the byte range
# that belongs to part $partno, download exactly that range into $filename
# and finally let bzip2recover inspect the result.
my $ua = LWP::UserAgent->new;
$ua->agent("WikipediaDownload/0.1 ");

# A HEAD request yields the total size without transferring the body.
my $head = $ua->request(HTTP::Request->new('HEAD' => $url));

die "HEAD error: ", $head->request->url, ' - ',
    $head->headers_as_string, "\n"
    unless $head->is_success;

my $cl = $head->content_length();
die "No content length\n" unless defined $cl;
die "content length is 0.\n" unless $cl;
print "$url\nLength on server: ", $cl, "\n";

# Each part starts $overlap bytes before its nominal offset, so that the
# data around the boundary with the previous part is fetched again.
my $overlap  = 200000;
my $startpos = ($blocksize * $partno) - $overlap;    # starting point
$startpos = 0 if $startpos < 0;

# A start at or past the end of the file would give a Range whose first
# byte lies after its last byte, which is not a valid range.
die "Part $partno starts at byte $startpos but the server only has $cl bytes\n"
    if $startpos >= $cl;

my $endpos = $startpos + $blocksize + $overlap;
$endpos = $cl if $endpos > $cl;

# Range positions are inclusive, hence the "- 1" on the end position.
printf "Requesting %s - %s\n", $startpos, $endpos - 1;
my $req = HTTP::Request->new('GET' => $url);
$req->init_header('Range' => sprintf("bytes=%s-%s", $startpos, $endpos - 1));

# The callback writes to the OUT handle, so that name has to stay.
# The payload is compressed binary data: keep the I/O layer from
# translating line endings.
open(OUT, '>', $filename)
    or die "cannot open $filename for writing: $!\n";
binmode OUT;

# The body is handed to callback() chunk by chunk as it arrives.
my $response = $ua->request($req, \&callback);

# Buffered write errors only surface when the handle is closed.
close(OUT)
    or die "cannot close $filename: $!\n";

die "GET error: ", $response->status_line, "\n"
    unless $response->is_success;

# 206 (Partial Content) confirms that the Range header was honoured.
warn "expected status 206 for a range request, got ", $response->code, "\n"
    unless $response->code == 206;

# LWP reports an exception thrown by the content callback in this header.
my $died = $response->header('X-Died');
warn "download aborted early: $died\n" if defined $died;

print "finished";
print "\n", $url, " ", scalar(localtime), "\n\n";

checkbz2();