カラクリサイクル

Random notes from someone who never had a "brilliant youth"

Plagger::Plugin::Subscription::Parse

Summary: a plugin that parses a web page, extracts feed URIs, and subscribes to them


For parsing a web page and subscribing to its feeds there is already Subscription::XPath, but its flexibility leaves something to be desired, and I figured it would be handy to be able to pull links out with regular expressions too, so I wrote this.

package Plagger::Plugin::Subscription::Parse;
use strict;
use warnings;
use URI;
use Encode ();
use HTML::TreeBuilder::XPath;
use HTML::Entities ();
use Plagger::Util;
use Plagger::Tag;
use Plagger::Feed;
use base qw( Plagger::Plugin );
sub register {
    my ( $self, $c ) = @_;
    $c->register_hook(
        $self,
        'subscription.load' => $self->can('load'),
    );
}
sub load {
    my ( $self, $c ) = @_;
    my $confs = $self->conf->{'subscribe'};
       $confs = [ $confs ] if ( ref $confs ne 'ARRAY' );
    my $default = $self->conf->{'default'} || {};
    # built-in fallback, applied later only when neither 'xpath' nor
    # 'regexp' is configured: grab every link on the page
    my $default_xpath = {
        url     => '//a/@href',
        title   => '//a/@title',
    };
    for my $conf ( @{ $confs } ) {
        $conf = { target => $conf } if ( ! ref $conf );
        my $uri = $conf->{'target'};
        if ( ! $uri ) {
            $c->log( error => "Target url is missing." );
            next;
        }
        Encode::_utf8_off( $uri );
        for my $name ( qw( xpath regexp after_hook ) ) {
            $conf->{$name} ||= $default->{$name};
        }
        # unconditionally defaulting 'xpath' would make the regexp branch
        # below unreachable, so apply the fallback only without 'regexp'
        $conf->{'xpath'} ||= $default_xpath if ( ! $conf->{'regexp'} );
        my $meta = {};
        $meta->{'link'} = $conf->{'target'};
        for my $name ( qw( title meta tag ) ) {
            $meta->{$name} = $conf->{$name} || $default->{$name};
        }
        if ( $meta->{'tag'} && ! ref $meta->{'tag'} ) {
            my $tags = [ Plagger::Tag->parse( $meta->{'tag'} ) ];
            $meta->{'tag'} = $tags;
        }
        $c->log( info => "Parse $uri" );
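        # fetch the page and decode the raw content into Perl's internal string form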
        my $xhtml = Plagger::Util::load_uri( URI->new( $uri ), $self );
           $xhtml = Plagger::Util::decode_content( $xhtml );
        my $title;
        if ( $xhtml =~ m{<title>\s*(.*?)\s*</title>}i ) {
            $title = HTML::Entities::decode( $1 );
        }
        $meta->{'title'} ||= $title || $meta->{'link'};
        my $datas;
        if ( defined $conf->{'xpath'} ) {
            $datas = $self->find_node( $c, $conf->{'target'}, $xhtml, $conf->{'xpath'} )
                or next;
        }
        else {
            $datas = $self->parse( $c, $conf->{'target'}, $xhtml, $conf->{'regexp'} )
                or next;
        }
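        # after_hook is a string of Perl eval'ed once per extracted item, with $data in scope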
        if ( $conf->{'after_hook'} ) {
            for my $data ( @{ $datas } ) {
                eval $conf->{'after_hook'};
                $c->error( $@ ) if ( $@ );
            }
        }
        for my $feed ( $self->create_feed( $c, $datas, $meta ) ) {
            $c->subscription->add( $feed );
        }
    }
}
sub find_node {
    my ( $self, $c, $url, $xhtml, $conf ) = @_;
    if ( ! $conf->{'url'} ) {
        $c->log( error => "'url' XPath is not specified. :$url" );
        return;
    }
    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->parse( $xhtml );
    $tree->eof;
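    # as_XML escapes text via HTML::Element::_xml_escape; swap it for our
    # Plagger::Util::encode_xml wrapper while serializing matched nodes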
    no warnings 'redefine';
    local *HTML::Element::_xml_escape = $self->can('xml_escape');
    use warnings;
    my $data = [];
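    # run each named XPath expression and merge its node list positionally:
    # the N-th node found by every expression lands in $data->[N]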
    for my $capture ( keys %{ $conf } ) {
        next if ( ! $conf->{$capture} );
        my $index = 0;
        for my $child ( $tree->findnodes( $conf->{$capture} ) ) {
            my $value = ( $child->isElementNode ) ? $child->as_XML : $child->getValue;
            $data->[$index] = {} if ( ! ref $data->[$index] );
            $data->[$index]->{$capture} = $value;
            $index++;
        }
    }
    return $data;
}
sub parse {
    my ( $self, $c, $url, $xhtml, $conf ) = @_;
    my @capture = split m{\s+}, ( $conf->{'capture'} || q{} );
    if ( ! @capture ) {
        $c->log( error => "There is no available 'capture'. :$url" );
        return;
    }
    if ( ! $conf->{'extract'} ) {
        $c->log( error => "'extrace' is not specified. :$url" );
        return;
    }
    my $extract = Plagger::Util::decode_content( $conf->{'extract'} );
    my $data  = [];
    my $index = 0;
    # m{}sg in scalar context advances pos($xhtml) on every successful
    # match; @- and @+ then hold the offsets of each capture group, so the
    # captured strings can be read back without a second, non-/g match
    # (which would reset pos() and re-match the first occurrence forever)
    while ( $xhtml =~ m{$extract}sg ) {
        my @matched = map {
            defined $-[$_] ? substr( $xhtml, $-[$_], $+[$_] - $-[$_] ) : undef
        } 1 .. $#+;
        last if ( ! @matched );
        for my $name ( @capture ) {
            $data->[$index] = {} if ( ! ref $data->[$index] );
            $data->[$index]->{$name} = shift @matched;
        }
        $index++;
    }
    return $data;
}
sub create_feed {
    my ( $self, $c, $datas, $meta ) = @_;
    my @feeds = ();
    for my $data ( @{ $datas } ) {
        my $feed = Plagger::Feed->new;
        $feed->url( URI->new_abs( $data->{'url'}, $meta->{'link'} ) );
        $feed->title( $data->{'title'} || $meta->{'title'} );
        $feed->link( $meta->{'link'} );
        $feed->meta( $meta->{'meta'} ) if ( $meta->{'meta'} );
        $feed->tags( $meta->{'tag'} )  if ( $meta->{'tag'}  );
        push @feeds, $feed;
    }
    return @feeds;
}
sub xml_escape {
    for my $x ( @_ ) {
        $x = Plagger::Util::encode_xml( $x );
    }
}
1;
__END__

=head1 NAME

Plagger::Plugin::Subscription::Parse - Use XPath or Regexp to extract subscriptions from web pages

=head1 SYNOPSIS

  - module: Subscription::Parse
    config:
      subscribe:
        - target: http://d.hatena.ne.jp/antipop/20050628/1119966355
          xpath:
            url: //ul[@class="xoxo" or @class="subscriptionlist"]//a/@href
            title: //ul[@class="xoxo" or @class="subscriptionlist"]//a/@title

=head1 DESCRIPTION

This plugin extracts subscriptions from XHTML content,
using XPath or regular expressions to find links.

=head1 CONFIG

=head2 subscribe

=over 4

=item target

URI of the web page to parse.

=item xpath

XPath expressions used to extract each piece of data.

=over 4

=item url

XPath expression that extracts the subscription URIs.

=item title

XPath expression that extracts the subscription titles.

=item other name

When an XPath expression is given under a name other than the two
above, the extracted value is stored in C<< $data->{'name'} >>.

=back

=item regexp

Regular-expression settings used to extract each piece of data.

=over 4

=item extract

Regular expression that extracts the data.

=item capture

Space-delimited names for the capture groups of the regular
expression. Each captured value is stored in C<< $data->{$name} >>.

=back

=item after_hook

Perl code that post-processes each extracted C<$data>.

=item title

Title of the subscription.

=item meta

Metadata attached to each feed.

=item tag

Tags attached to each feed.

=back

=head2 default

Default values used when a C<subscribe> entry has no individual
setting.

=over 4

=item xpath

=item regexp

=item after_hook

=item title

=item meta

=item tag

=back

=head1 AUTHOR

Naoki Okamura (Nyarla,) E<lt>thotep@nyarla.netE<gt>

=head1 LICENSE

This plug-in is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

=head1 SEE ALSO

L<Plagger>

=cut

Usage looks like this; see the POD for the finer configuration options. The POD's English is machine-translated, so the wording may be a little off.

plugins:
  - module: Subscription::Parse
    config:
      subscribe:
        - target: http://localhost/list.html
          xpath:
            url: //ul/li/a/@href
            title: //ul/li/a/@title
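
The same extraction can be done without XPath. Here is a minimal regexp-based sketch, assuming a page whose list is plain <a href="...">...</a> markup (the target URL and the link pattern are made up for illustration):

plugins:
  - module: Subscription::Parse
    config:
      subscribe:
        - target: http://localhost/list.html
          regexp:
            extract: <a href="([^"]+)">([^<]+)</a>
            capture: url title

Each match produces one $data: the first capture group is stored under url and the second under title, as named in capture.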

I wrote it with CustomFeed::Config, Filter::EntryFullText, and Subscription::Config as references.

I think this covers most cases of parsing a web page, pulling out URIs, and subscribing to them. Both XPath and regular expressions are available, and after_hook lets you post-process whatever was extracted, so it should be fairly flexible.
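
For instance, here is a minimal sketch of an after_hook that rewrites every extracted URL ending in a slash to point at index.rdf (the rewrite itself is made up for illustration):

plugins:
  - module: Subscription::Parse
    config:
      default:
        after_hook: "$data->{'url'} =~ s{/$}{/index.rdf}"
      subscribe:
        - target: http://localhost/list.html

The hook is eval'ed with each extracted $data in scope, so any Perl that munges $data works here.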

Incidentally, I scaffolded this with plugin-start.pl, so I wrote proper tests as well. I did write them, but pasting them into this entry would make it awfully long, so I have left them out. The plugin is in a committable state, too, but I don't have the nerve to commit it to the official repository.

I would like to publish the tests as well, but I have no public repository to put them in, which is awkward. There is Google Code Project Hosting, but that has issues of its own, so I am hoping to find some way around this.

I have not yet written tests for the plugins I made before this one, so that is next.

Addendum

Fixed: the fetched content was not being run through decode_content.