Любимая функция для получения контента.
Таскаю её за собой по всем модулям и скриптам.
# getURL(\%params) -- fetch a URL with WWW::Curl::Easy and return the body
# together with some transfer metadata.
#
# Parameters (single hash reference; croaks if it is not a plain HASH ref):
#   href            required; "http://" is prepended when no http(s) scheme
#                   is present
#   headers         true => include response headers in the returned content
#                   (default 0)
#   ag              User-Agent string (default: random entry of the
#                   file-level @user_agents)
#   timeout         connect and total timeout in seconds (default 20)
#   content_length  abort the transfer once more than this many bytes have
#                   been downloaded (default 600000000)
#   ip              optional, passed to CURLOPT_INTERFACE
#   proxy           optional, passed to CURLOPT_PROXY
#   post            optional POST body; a true value switches to POST
#   cookie_string   optional, passed to CURLOPT_COOKIE
#   cookie_file     optional, used as both cookie jar and cookie file
#   referer         optional, passed to CURLOPT_REFERER
#
# Note: the defaults above are written back into the caller's hash.
#
# Returns a hash reference with keys content, retcode (result of
# $curl->perform), http_code, effective_url and content_type.
#
# Dies if STDOUT cannot be duplicated/redirected/restored or if the
# in-memory buffer cannot be opened.
sub getURL {
    my $params = shift;
    croak q[HASH needed as param] if ref($params) ne 'HASH';
    croak q[HREF needed in HASH]  if !exists $params->{href};

    # Add a scheme only when the caller did not supply one.
    $params->{href} = q[http://] . $params->{href}
        if $params->{href} !~ m{\Ahttps?://}i;

    $params->{headers}        = 0 if !$params->{headers};
    $params->{ag}             = qq($user_agents[ rand @user_agents ])
        if !exists $params->{ag};
    $params->{timeout}        = 20        if !exists $params->{timeout};
    $params->{content_length} = 600000000 if !exists $params->{content_length};

    # STDOUT is silenced for the duration of the transfer -- presumably
    # because the curl binding may print to it; TODO confirm.
    open my $oldout, '>&', \*STDOUT or die "Can't dup STDOUT: $!";
    close STDOUT;

    my $content = '';
    my ($curl, $retcode);

    # Run the transfer inside eval so that STDOUT is always restored,
    # even when something below dies.
    my $completed = eval {
        open STDOUT, '>', '/dev/null'
            or die "Can't redirect STDOUT to /dev/null: $!";

        $curl = WWW::Curl::Easy->new;
        $curl->setopt(CURLOPT_URL, $params->{href});
        $curl->setopt(CURLOPT_INTERFACE, $params->{ip}) if exists $params->{ip};
        $curl->setopt(CURLOPT_CONNECTTIMEOUT, $params->{timeout});
        $curl->setopt(CURLOPT_TIMEOUT,        $params->{timeout});

        # Progress reporting must be enabled for the progress callback to
        # run; the callback returns non-zero once its third argument
        # (presumably bytes downloaded so far -- verify against WWW::Curl)
        # exceeds the configured limit.
        $curl->setopt(CURLOPT_NOPROGRESS, 0);
        $curl->setopt(CURLOPT_PROGRESSFUNCTION,
            sub { $_[2] > $params->{content_length} ? 1 : 0 });

        $curl->setopt(CURLOPT_USERAGENT, $params->{ag});
        $curl->setopt(CURLOPT_POST, 1)                    if $params->{post};
        $curl->setopt(CURLOPT_PROXY, $params->{proxy})    if $params->{proxy};
        $curl->setopt(CURLOPT_POSTFIELDS, $params->{post}) if $params->{post};
        $curl->setopt(CURLOPT_HEADER, $params->{headers});
        $curl->setopt(CURLOPT_COOKIE, $params->{cookie_string})
            if $params->{cookie_string};
        $curl->setopt(CURLOPT_COOKIEJAR, $params->{cookie_file})
            if $params->{cookie_file};
        $curl->setopt(CURLOPT_COOKIEFILE, $params->{cookie_file})
            if $params->{cookie_file};
        $curl->setopt(CURLOPT_FOLLOWLOCATION, 1);
        $curl->setopt(CURLOPT_REFERER, $params->{referer})
            if exists $params->{referer};

        # Collect the response in memory instead of a real file.
        open my $tmp_for_curl, '>', \$content
            or die "Can't open in-memory buffer: $!";
        $curl->setopt(CURLOPT_FILE, $tmp_for_curl);

        $retcode = $curl->perform;
        1;
    };
    my $error = $@;

    # Put the original STDOUT back before reporting anything.
    close STDOUT;
    open STDOUT, '>&', $oldout or die "Can't dup \$oldout: $!";

    die $error if !$completed;

    return {
        content       => $content,
        retcode       => $retcode,
        http_code     => $curl->getinfo(CURLINFO_HTTP_CODE),
        effective_url => $curl->getinfo(CURLINFO_EFFECTIVE_URL),
        content_type  => $curl->getinfo(CURLINFO_CONTENT_TYPE),
    };
}
LWP::UserAgent короче :)
Только попродвинутее :)
[...] С использованием Jum::Web::Crawler — функцию под него я уже постил [...]