# copyright 2009 h4ck3rm1k3@flossk.org
# licensed under GNU Affero General Public License
# http://www.fsf.org/licensing/licenses/agpl-3.0.html
use LWP::UserAgent;
use Compress::Bzip2 qw(:all );
use strict;
use warnings;
# Command line: [part number] [dump URL]. Both arguments are optional.
# Without a part number the first part (0) is fetched.
my $partno = @ARGV ? shift @ARGV : 0;
# Fall back to the 2009-07-13 English Wikipedia dump when no URL is given.
my $url = shift @ARGV;
$url = 'http://download.wikimedia.org/enwiki/20090713/enwiki-20090713-pages-articles.xml.bz2'
    unless $url;
# Each part is stored in its own file, named after the zero-padded part number.
my $filename = sprintf("wikipedia_dump_part_%0.4d_.bz2", $partno);
warn "getting $filename";
#die $filename;
# Identity helper: hands back its first argument unchanged.
# Any further arguments are ignored; with no arguments the result is undef.
sub c
{
    my ($value) = @_;
    return $value;
}
# Decompress a bzip2-compressed string with a fresh Compress::Bzip2 stream.
#
# Argument: $data - the compressed bytes.
# Returns:  the decompressed bytes. If add() yields undef the result is
#           undef; if only finish() yields undef, the output of add() is
#           returned on its own (per the Compress::Bzip2 documentation both
#           methods return undef on error).
sub bunzip {
    my $data = shift;
    my $stream = Compress::Bzip2::decompress_init();
    my $ret = $stream->add($data);
    # finish() returns whatever output completes the stream; throwing that
    # away (as was done before) could truncate the decompressed result.
    my $tail = $stream->finish;
    return $ret unless defined $ret && defined $tail;
    return $ret . $tail;
}
# Every chunk handed to callback() is appended to $seen.
my $seen = "";
# $blocksize is the number of bytes per part; $chunksize is only reported
# here and is not referenced by the rest of the visible code.
my ($blocksize, $chunksize) = (1500000, 150000);
warn " chunksize $chunksize \n";
# Text of the most recent warning, recorded by the __WARN__ handler below.
our $warn = "";
# Warnings are announced with an "error" marker, re-emitted, and remembered.
$SIG{__WARN__} = sub {
    my ($message) = @_;
    warn "error";
    warn $message;
    $warn = $message;
};
# Fatal errors are announced with an "error2" marker and re-emitted as
# warnings; the handler does not stop the die from propagating.
$SIG{__DIE__} = sub {
    my ($message) = @_;
    warn "error2";
    warn $message;
};
# Run bzip2recover on the downloaded part ($filename) and report the bzip2
# blocks it finds.
#
# Every output line of the tool (stdout and stderr merged by the shell) is
# echoed with print. Lines of the form "block N runs from A to B" are also
# reported through warn, together with the block size (B - A) and the start
# position divided by 8 (presumably bzip2recover reports bit positions, so
# this is the byte offset).
#
# Takes no arguments. Returns the result of closing the pipe, or a false
# value when bzip2recover could not be started.
sub checkbz2
{
    # A lexical handle and a checked open: previously a failed open went
    # unnoticed and the loop read from an unopened bareword handle.
    my $bz;
    unless (open $bz, '-|', "bzip2recover $filename 2>&1") {
        warn "cannot run bzip2recover on $filename: $!";
        return;
    }
    while (my $line = <$bz>) {
        if ($line =~ /block (\d+) runs from (\d+) to (\d+)/) {
            my ($block, $from, $to) = ($1, $2, $3);
            my $size  = $to - $from;
            my $fromb = $from / 8;    # bytes
            warn "Found $line";
            warn "Block $block size $size";
            warn "Block $block from $fromb";
        }
        print $line;
    }
    return close $bz;
}
# Content callback for LWP::UserAgent->request: called once per chunk of
# the response body.
#
# Arguments: $data     - the chunk of body bytes,
#            $response - the response object (unused here),
#            $protocol - the protocol object (unused here).
# Writes the chunk to the OUT filehandle and appends it to $seen.
# Returns the total number of bytes collected in $seen so far.
# Dies when the chunk cannot be written, so a failed write (for example a
# full disk) no longer leaves a silently corrupted part file behind.
sub callback
{
    my ($data, $response, $protocol) = @_;
    print OUT $data
        or die "cannot write downloaded data: $!";
    $seen .= $data;
    return length($seen);
}
# Main sequence: find out how large the remote file is, request the byte
# range that belongs to part $partno, stream it into $filename through
# callback(), then let checkbz2 inspect the result.
my $ua = LWP::UserAgent->new;
$ua->agent("WikipediaDownload/0.1 ");

# A HEAD request is enough to learn the total size of the remote file.
my $head = $ua->request(HTTP::Request->new('HEAD' => $url));
die "HEAD error: ", $head->request->url, ' - ',
    $head->headers_as_string, "\n"
    unless $head->is_success;
my $cl = $head->content_length();
die "No content length\n" unless defined $cl;
die "content length is 0.\n" unless $cl;
print "$url\nLength on server: ", $cl, "\n";

# NOTE(review): this decompression stream is created but not used by any
# code visible in this file.
my $stream = undef;
$stream = Compress::Bzip2::decompress_init();
# Pass request to the user agent and get a response back
#1473817

# Each part covers $blocksize bytes and starts $overlap bytes early, so
# consecutive parts overlap; the range is clamped to the file boundaries.
my $overlap = 200000;
my $startpos = ($blocksize * $partno) - $overlap;    # starting point
$startpos = 0 if $startpos < 0;
# Without this guard a part number that is too large produced an invalid
# range (first byte beyond the last byte).
die "Part $partno starts at byte $startpos, beyond the end of the file ($cl bytes)\n"
    if $startpos >= $cl;
my $endpos = $startpos + $blocksize + $overlap;
$endpos = $cl if $endpos > $cl;
printf "Requesting %s - %s\n", $startpos, $endpos - 1;

my $req = HTTP::Request->new('GET' => $url);
$req->init_header('Range' => sprintf("bytes=%s-%s", $startpos, $endpos - 1));

# The bareword handle OUT is kept because callback() prints to it.
open OUT, '>', $filename
    or die "Cannot open $filename for writing: $!\n";
binmode OUT;    # the payload is compressed binary data
my $response = $ua->request($req, \&callback);
close OUT
    or die "Cannot finish writing $filename: $!\n";

# Do not report success for a transfer that failed or was cut short.
die "GET error: ", $response->status_line, "\n"
    unless $response->is_success;
# LWP records an exception thrown inside the content callback in X-Died.
my $died = $response->header('X-Died');
die "Download aborted: $died\n" if defined $died;
# Anything other than 206 Partial Content means the Range header was not
# honoured, so the saved data is not the requested slice.
warn "Server answered ", $response->code,
    " instead of 206; $filename is not the requested byte range\n"
    unless $response->code == 206;

print "finished";
#$stream->finish;
print "\n", $url, " ", scalar(localtime), "\n\n";
checkbz2();