If you connect to the IRC server on irc.freenode.net and run /join #pm2irc, you'll see the pm2irc bot sitting there. Every ten minutes, it gets the newest questions from perlmonks.com and if there's new ones, it posts their topic and URL to the IRC channel. If you run a graphical IRC client like Gaim, all you need is click on the link to open up a browser showing the question page. Handy for XP-hungry monks.
Here's the script that does it. I'll let it run for a while to collect feedback, let me know if you find it useful. Ideally, we wouldn't have to scrape the site but have a process on perlmonks funnelling the realtime data into an IRC channel, wouldn't that be great?
#!/usr/bin/perl -w
###########################################
# pm2irc - Scrape perlmonks.com and post
# new questions to #pm2irc
###########################################
use strict;
use Bot::BasicBot;
use HTML::TreeBuilder;
use URI::URL;
use CGI qw(a);
use Cache::FileCache;
use HTTP::Request::Common;
use Log::Log4perl qw(:easy);
use POE;
use POE::Component::Client::HTTP;
our $CHANNEL = "#pm2irc";
our $USER = "pm2irc";
our $FETCH_INTERVAL = 600;
our $FETCH_URL = "http://perlmonks.com/" .
"?node=Newest%20Nodes";
Log::Log4perl->easy_init($INFO);
our $cache = new Cache::FileCache({
namespace => "pm2irc",
});
my $Bot = Bot::BasicBot->new(
server => 'irc.freenode.net',
channels => [$CHANNEL],
nick => $USER,
);
DEBUG "Setting up pm2irc POE components";
POE::Component::Client::HTTP->spawn(
Alias => "ua",
Timeout => 60,
);
POE::Session->create(
inline_states => {
_start => sub {
# Wait 20 secs before the first post
$poe_kernel->delay('http_start', 20);
},
http_start => sub {
DEBUG "Fetching url $FETCH_URL";
$poe_kernel->post("ua", "request",
"http_ready", GET $FETCH_URL);
$poe_kernel->delay('http_start',
$FETCH_INTERVAL);
},
http_ready => sub {
DEBUG "http_ready $FETCH_URL";
my $resp= $_[ARG1]->[0];
if($resp->is_success()) {
pm_update($resp->content());
} else {
ERROR "Can't fetch $FETCH_URL: ", $resp->message();
}
},
}
);
DEBUG "The dance begins ...";
$Bot->run();
###########################################
sub pm_update {
###########################################
my($html_text) = @_;
if(my @nws = latest_news($html_text)) {
for(@nws) {
INFO "Sending '$_' to channel";
$Bot->say(channel => $CHANNEL,
body => "$_",
);
}
}
}
###########################################
sub latest_news {
###########################################
my($html_string) = @_;
my $start_url =
URI::URL->new($FETCH_URL);
my $max_node;
my $saved = $cache->get("max-node");
$saved = 0 unless defined $saved;
my @aimtext = ();
for my $entry (@{qparse($html_string)}) {
my($text, $url) = @$entry;
my($node) = $url =~ /(\d+)$/;
if($node > $saved) {
INFO "New node $text ($url)";
push @aimtext, "$text $url";
}
$max_node = $node if
!defined $max_node or
$max_node < $node;
}
$cache->set("max-node", $max_node)
if $saved < $max_node;
return @aimtext;
}
###########################################
sub qparse {
###########################################
my($html_string) = @_;
my $start_url =
URI::URL->new($FETCH_URL);
my @questions = ();
my $parser = HTML::TreeBuilder->new();
my $tree = $parser->parse($html_string);
my($questions) = $tree->look_down(
"_tag", "a",
"name", "toc-Questions");
if(! $questions) {
ERROR "Couldn't find Questions section";
return undef;
}
my $node = $questions->parent();
while($node->tag() ne "table") {
$node = $node->right();
}
for my $tr ($node->look_down(
"_tag", "tr")) {
for my $a ($tr->look_down(
"_tag", "a")) {
my $href = $a->attr('href');
my $text = $a->as_text();
my $url = URI::URL->new($href,
$start_url);
push @questions,
[$text, $url->abs()];
# Process only the question
# node, not the author's node
last;
}
}
$tree->delete();
return \@questions;
}
Is there a particular reason you query against the Newest Nodes page, rather than the available XML tickers' Newest Nodes XML ticker, which would likely be easier to parse and digest for the information you want (such as looking for entries with a nodetype that is not a "user", "note", or "poll")? Just curious....
C.
Please do have a look at:
node 72241
PerlMonks::Mechanized (beta)
I believe the information and code there will help you achieve your goal more easily and without bandwidth-hogging scraping. :-)
HTH,
#!/usr/bin/perl -w
###########################################
# pm2irc - Scrape perlmonks.com and post
# new questions to #pmnewestnodes
###########################################
use strict;
use Bot::BasicBot;
use HTML::TreeBuilder;
use URI::URL;
use CGI qw(a);
use Cache::FileCache;
use HTTP::Request::Common;
use Log::Log4perl qw(:easy);
use POE;
use POE::Component::Client::HTTP;
use XML::Simple;
our $CHANNEL = "#pm2irc";
our $USER = "pm2irc";
our $FETCH_INTERVAL = 600;
our $BASE_URL = "http://perlmonks.com/?node_id=";
# RSS feed
our $FETCH_URL = "${BASE_URL}30175";
Log::Log4perl->easy_init($INFO);
our $cache = new Cache::FileCache({
namespace => "pm2irc",
});
my $Bot = Bot::BasicBot->new(
server => 'irc.freenode.net',
channels => [$CHANNEL],
nick => $USER,
);
DEBUG "Setting up pm2irc POE components";
POE::Component::Client::HTTP->spawn(
Alias => "ua",
Timeout => 60,
);
POE::Session->create(
inline_states => {
_start => sub {
# Wait 20 secs before the first post
$poe_kernel->delay('http_start', 20);
},
http_start => sub {
DEBUG "Fetching url $FETCH_URL";
$poe_kernel->post("ua", "request",
"http_ready", GET $FETCH_URL);
$poe_kernel->delay('http_start',
$FETCH_INTERVAL);
},
http_ready => sub {
DEBUG "http_ready $FETCH_URL";
my $resp= $_[ARG1]->[0];
if($resp->is_success()) {
pm_update($resp->content());
} else {
ERROR "Can't fetch $FETCH_URL: ", $resp->message();
}
},
}
);
DEBUG "The dance begins ...";
$Bot->run();
###########################################
sub pm_update {
###########################################
my($html_text) = @_;
if(my @nws = latest_news($html_text)) {
for(@nws) {
INFO "Sending '$_' to channel";
$Bot->say(channel => $CHANNEL,
body => "$_",
);
}
}
}
###########################################
sub latest_news {
###########################################
my($xml_string) = @_;
my $max_node;
my $saved = $cache->get("max-node");
$saved = 0 unless defined $saved;
my @aimtext = ();
my $data = XMLin($xml_string);
for my $node (@{ $data->{NODE} }) {
next unless $node->{nodetype};
next unless $node->{nodetype} =~
/perlquestion|monkdiscussion/;
$node->{content} =~ s/\n//g;
if($node->{node_id} > $saved) {
INFO "New node $node->{content} ($node->{node_id})";
unshift @aimtext, "$node->{content} " .
"${BASE_URL}$node->{node_id}";
}
$max_node = $node if !defined $max_node or
$max_node < $node->{node_id};
}
$cache->set("max-node", $max_node)
if $saved < $max_node;
return @aimtext;
}
couple of notes.
in the new XML version, you no longer need to "use [cpan://HTML::TreeBuilder]" - i know this because i had a devil of a time installing that from CPAN, in fact i gave up! (actually, i tried again just now and it installed seemlessly. *shrug*)
second, i was inspired to lift most of your code and adjust it to my needs. i tried [cpan://Net::YahooMessenger] and had issues - i don't think they've kept up with YIM's protocol and/or server updates. so i resorted to [cpan://Mac::AppleScript] to create an applescript and send myself an IM that way, and decided to post the resulting code here :)
#!/usr/bin/perl -w
use Mac::AppleScript qw(RunAppleScript);
use POE;
use POE::Component::Client::HTTP;
use URI::URL;
use CGI qw(a);
use Cache::FileCache;
use HTTP::Request::Common;
use Log::Log4perl qw(:easy);
use XML::Simple;
our $BASE_URL = "http://perlmonks.org/?node_id=";
our $FETCH_URL = "${BASE_URL}30175";
$FROM_ACCOUNT = "m0nkb0t";
$TO_ACCOUNT = "m0nk22";
Log::Log4perl->easy_init({level=>$DEBUG,file=>">>apl.log"});
our $cache = new Cache::FileCache({
namespace => "pm2yim",
});
DEBUG "setting up pm2yim POE components";
POE::Component::Client::HTTP->spawn(
Alias => "ua",
Timeout => 60,
);
POE::Session->create(
inline_states => {
_start => sub {
# Wait 20 secs before the first post
$poe_kernel->delay('http_start', 20);
},
http_start => sub {
DEBUG "Fetching url $FETCH_URL";
$poe_kernel->post("ua", "request", "http_ready", GET $FETCH_URL);
$poe_kernel->delay('http_start', $FETCH_INTERVAL);
},
http_ready => sub {
DEBUG "http_ready $FETCH_URL";
my $resp= $_[ARG1]->[0];
if($resp->is_success()) {
pm_update($resp->content());
} else {
ERROR "Can't fetch $FETCH_URL: ", $resp->message();
}
},
}
);
DEBUG "The dance begins ...";
$poe_kernel->run();
sub pm_update {
my($html_text) = @_;
if(my @nws = latest_news($html_text)) {
for(@nws) {
INFO "Sending '$_' to IM";
im_update( $_ );
}
}
}
sub im_update {
my $scr;
my $msg = shift;
DEBUG "Attempting to update via IM";
($scr = <<" EOS") =~ s/^\s+//gm;
set fromAcct to "$FROM_ACCOUNT"
set toAcct to "$TO_ACCOUNT"
tell application "Adium"
repeat with acc in accounts
if display name of acc is fromAcct then
connect acc
repeat with ctc in contacts of acc
if display name of ctc is toAcct then
send ctc message "$msg" on account acc
end if
end repeat
disconnect acc
end if
end repeat
end tell
EOS
RunAppleScript($scr) or DEBUG "couldn't run applescript";
}
sub latest_news {
my($xml_string) = @_;
my $max_node;
my $saved = $cache->get("max-node");
$saved = 0 unless defined $saved;
my @aimtext = ();
my $data = XMLin($xml_string);
for my $node (@{ $data->{NODE} }) {
next unless $node->{nodetype};
next unless $node->{nodetype} =~
/perlquestion|monkdiscussion/;
$node->{content} =~ s/\n//g;
if($node->{node_id} > $saved) {
INFO "New node $node->{content} ($node->{node_id})";
unshift @aimtext, "$node->{content} " .
"${BASE_URL}$node->{node_id}";
}
$max_node = $node->{node_id} if !defined $max_node or
$max_node < $node->{node_id};
}
$cache->set("max-node", $max_node)
if $saved < $max_node;
return @aimtext;
}
granted, the target audience for these particular changes is EXTREMELY small, but in case anyone is interested in watching [id://3628] and having an IM sent to their yahoo account via an Adium applescript, well, there it is :)
$/ = q#(\w)# ; sub sig { print scalar reverse join ' ', @_ } sig
map { s$\$/\$/$\$2\$1$g && $_ } split( ' ', ",erckha rlPe erthnoa stJu" );
perlmonks.org content © perlmonks.org and atcroft, castaway, chargrill, planetscape, saintmike
prlmnks.org © 2006 edmund von der burg (eccles & toad)
v 0.03