
mixiの画像は、ページ表示時にサムネールの一時URLと、画像表示用ページのpermalinkを返してくれる。画像表示用ページ(show_*_picture.pl)にアクセスすると、これまた画像の一時URLを返してくれる。

package WWW::Mixi::Scraper::Plugin::ShowDiaryPicture;

# Plugin for the diary picture page (show_diary_picture.pl).
# It pulls the image URL out of the page and downloads the image data.

use strict;
use warnings;
use WWW::Mixi::Scraper::Plugin;

# Accepted query parameters, each paired with the name of its validation rule.
validator {qw(
  id        is_number
  owner_id  is_number
  number    is_number
)};

# scrape($html)
#
# Parameters:
#   $html - HTML text of the picture page.
#
# Returns the first record produced by post_process(). When that record has
# a `link`, the content fetched from that URL is stored under `image`.
sub scrape {
  my ($self, $html) = @_;

  # Store the SRC attribute of an img element under `link`.
  # `nil` is a dummy second key: per the author's note, requesting `link`
  # alone did not yield the structure post_process() expects.
  my $scraper = scraper {
    process 'img',
      link => '@SRC';
    result qw ( link nil );
  };

  my $stash = $self->post_process($scraper->scrape(\$html))->[0];
  if ($stash->{link}) {
    # The page only exposes a URL; fetch the image body through the
    # plugin's mech object so the caller gets the data itself.
    $stash->{image} = $self->{mech}->get_content($stash->{link});
  }
  return $stash;
}

1;


Pluginの作り方は、Plugin.pmに書いてある。qw がなんだかわからなかった(調べろ→自分)んだけど、post_process() は配列を求めているのに、qw の中身が一つだと配列にならないらしくて怒られたので、ダミーを入れてみた。
package WWW::Mixi::Scraper::Plugin::ShowBbsPicture;

# Plugin for the BBS (community topic) picture page.
# It pulls the image URL out of the page and downloads the image data.

use strict;
use warnings;
use WWW::Mixi::Scraper::Plugin;

# Accepted query parameters, each paired with the name of its validation rule.
validator {qw(
  id        is_number
  comm_id   is_number
  number    is_number
)};

# scrape($html)
#
# Parameters:
#   $html - HTML text of the picture page.
#
# Returns the first record produced by post_process(). When that record has
# a `link`, the content fetched from that URL is stored under `image`.
sub scrape {
  my ($self, $html) = @_;

  # Store the SRC attribute of an img element under `link`.
  # `nil` is a dummy second key: per the author's note, requesting `link`
  # alone did not yield the structure post_process() expects.
  my $scraper = scraper {
    process 'img',
      link => '@SRC';
    result qw ( link nil );
  };

  my $stash = $self->post_process($scraper->scrape(\$html))->[0];
  if ($stash->{link}) {
    # The page only exposes a URL; fetch the image body through the
    # plugin's mech object so the caller gets the data itself.
    $stash->{image} = $self->{mech}->get_content($stash->{link});
  }
  return $stash;
}

1;


package WWW::Mixi::Scraper::Plugin::ShowBbsCommentPicture;

# Plugin for the BBS comment picture page.
# It pulls the image URL out of the page and downloads the image data.

use strict;
use warnings;
use WWW::Mixi::Scraper::Plugin;

# Accepted query parameters, each paired with the name of its validation rule.
validator {qw(
  id       is_number
  bbs_id   is_number
  comm_id  is_number
  number   is_number
)};

# scrape($html)
#
# Parameters:
#   $html - HTML text of the picture page.
#
# Returns the first record produced by post_process(). When that record has
# a `link`, the content fetched from that URL is stored under `image`.
sub scrape {
  my ($self, $html) = @_;

  # Store the SRC attribute of an img element under `link`.
  # `nil` is a dummy second key: per the author's note, requesting `link`
  # alone did not yield the structure post_process() expects.
  my $scraper = scraper {
    process 'img',
      link => '@SRC';
    result qw ( link nil );
  };

  my $stash = $self->post_process($scraper->scrape(\$html))->[0];
  if ($stash->{link}) {
    # The page only exposes a URL; fetch the image body through the
    # plugin's mech object so the caller gets the data itself.
    $stash->{image} = $self->{mech}->get_content($stash->{link});
  }
  return $stash;
}

1;


validator のところ(getパラメータ)以外はまったく同じなので、もうちょっとなんとかしたいところだけど、perlのこの辺の仕組みがわからないのでなんともならなかった。
my $pic = $mixi->parse("/show_diary_picture.pl?owner_id=$oid&id=$id&number=$number");
my $pic = $mixi->show_diary_picture->parse( owner_id => $oid, id => $id, number => $number );
長いので、全文ではなくhttp://svn.bulknews.net/repos/plagger/trunk/ との差分で。差分を取ったときのリビジョンは2063。(たぶん、ベースにしたバージョンがもっと古いので、最新のものより機能足りないっぽい。最新取り直してやり直した方が良いかなあ。)
Index: MixiScraper.pm
--- MixiScraper.pm	(リビジョン 2063)
+++ MixiScraper.pm	(作業コピー)
@@ -74,7 +74,6 @@
       email => $self->conf->{email},
       password => $self->conf->{password},
       cookie_jar => $cookie_jar,
-      mode => $self->conf->{mode},
     my $feed = Plagger::Feed->new;
@@ -86,15 +85,13 @@
     my($self, $context, $args) = @_;
     for my $type (@{$self->conf->{feed_type} || ['FriendDiary']}) {
         $context->error("$type not found") unless $MAP->{$type};
-        if ($type eq 'BBS' and $self->conf->{split_bbs_feed}) {
-            $self->aggregate_bbs_feed($context, $type, $args);
-        }
-        else {
-            $self->aggregate_feed($context, $type, $args);
-        }
+	if ($type eq 'BBS') {
+	    $self->aggregate_feed_bbs($context, $type, $args);
+	} else {
+	    $self->aggregate_feed($context, $type, $args);
+	}
 sub aggregate_feed {
     my($self, $context, $type, $args) = @_;
@@ -102,6 +99,8 @@
+    my $format = DateTime::Format::Strptime->new(pattern => '%Y-%m-%d %H:%M');
     my $meth = $MAP->{$type}->{get_list};
     my @msgs = $self->{mixi}->$meth->parse;
     my $items = $self->conf->{fetch_items} || 20;
@@ -110,148 +109,274 @@
     my $i = 0;
-    $self->{blocked} = 0;
+    my $blocked = 0;
     for my $msg (@msgs) {
         next if $type eq 'FriendDiary' and $msg->{link}->query_param('url'); # external blog
         last if $i++ >= $items;
-        $self->add_entry( $context, $type, $feed, $msg );
+        my $entry = Plagger::Entry->new;
+        $entry->title($msg->{subject});
+        $entry->link($msg->{link});
+        $entry->author($msg->{name});
+        $entry->date( Plagger::Date->parse($format, $msg->{time}) );
+	if ($entry->date) {
+	    $entry->date->set_time_zone('Asia/Tokyo');
+	}
+        if ($self->conf->{show_icon} && !$blocked && defined $MAP->{$type}->{icon}) {
+            my $owner_id = $msg->{link}->query_param($MAP->{$type}->{icon});
+            $context->log(info => "Fetch icon of id=$owner_id");
+            my $item = $self->cache->get_callback(
+                "outline-$owner_id",
+                sub {
+                    Time::HiRes::sleep( $self->conf->{fetch_body_interval} || 1.5 );
+                    my $item = $self->{mixi}->show_friend->parse(id => $owner_id)->{outline};
+                    $item;
+                },
+                '12 hours',
+            );
+            if ($item && $item->{image} !~ /no_photo/) {
+                # prefer smaller image
+                my $image = $item->{image};
+                   $image =~ s/\.jpg$/s.jpg/;
+                $entry->icon({
+                    title => $item->{name},
+                    url   => $image,
+                    link  => $item->{link},
+                });
+            }
+        }
+        my @comments;
+        if ($self->conf->{fetch_body} && !$blocked && $msg->{link} =~ /view_/ && defined $MAP->{$type}->{get_detail}) {
+            # view_enquete is not implemented and probably
+            # won't be implemented as it seems redirected to
+            # reply_enquete
+            next if $msg->{link} =~ /view_enquete/;
+            $context->log(info => "Fetch body from $msg->{link}");
+            my $item = $self->cache->get_callback(
+                "item-".$msg->{link},
+                sub {
+                    Time::HiRes::sleep( $self->conf->{fetch_body_interval} || 1.5 );
+                    my $item = $self->{mixi}->parse($msg->{link});
+                    $item;
+                },
+                '12 hours',
+            );
+            if ($item) {
+                my $body = $item->{description};
+                   $body =~ s!(\r\n?|\n)!<br />!g;
+                for my $image (@{ $item->{images} || [] }) {
+		    my $imagelink = $image->{link};
+		    $imagelink =~ s!.*(show_.*?_picture\.pl.*?)'.*!http://mixi.jp/$1!;
+		    if ( $self->conf->{fetch_image} ) {
+			my $pic = $self->{mixi}->parse($imagelink);
+			if ($pic->{image}) {
+			    my $filename = $pic->{link};
+			    $filename =~ s!.*/!!;
+			    open(OUT, ">" . $self->conf->{fetch_image_path} . "/images/mixi/diary/" . $filename);
+			    print OUT $pic->{image};
+			    close(OUT);
+			    my $imagepath = "/images/mixi/diary/$filename";
+			    $body .= qq(<a href="$imagelink"><img src="$imagepath" /></a>);
+			} else {
+			    $body .= qq(<a href="$imagelink"><object type="text/html" data="$imagelink" >$imagelink</object></a>);
+			}
+		    } else {
+			$body .= qq(<a href="$imagelink"><object type="text/html" data="$imagelink" >$imagelink</object></a>);
+		    }
+                }
+                $entry->body($body);
+                $entry->date( Plagger::Date->parse($format, $item->{time}) );
+		if ($entry->date) {
+		    $entry->date->set_time_zone('Asia/Tokyo');
+		}
+                if ($self->conf->{fetch_comment}) {
+                  for my $comment (@{ $item->{comments} || [] }) {
+                      my $c = Plagger::Entry->new;
+                         $c->title($entry->title . ': '. $comment->{subject});
+                         $c->body($comment->{description});
+                         $c->link($comment->{link});
+                         $c->author($comment->{name});
+                         $c->date( Plagger::Date->parse($format, $comment->{time}) );
+		      if ($c->date) {
+			  $c->date->set_time_zone('Asia/Tokyo');
+		      }
+                      push @comments, $c;
+                  }
+                }
+            } else {
+                $context->log(warn => "Fetch body failed. You might be blocked?");
+                $blocked++;
+            }
+        }
+        $feed->add_entry($entry);
+        for my $comment ( @comments ) {
+            $feed->add_entry($comment);
+        }
-sub aggregate_bbs_feed {
+sub aggregate_feed_bbs {
     my($self, $context, $type, $args) = @_;
+    my $format = DateTime::Format::Strptime->new(pattern => '%Y-%m-%d %H:%M');
     my $meth = $MAP->{$type}->{get_list};
     my @msgs = $self->{mixi}->$meth->parse;
     my $items = $self->conf->{fetch_items} || 20;
     $self->log(info => 'fetch ' . scalar(@msgs) . ' entries');
     my $i = 0;
-    $self->{blocked} = 0;
+    my $blocked = 0;
     for my $msg (@msgs) {
-        next if $type eq 'FriendDiary' and $msg->{link}->query_param('url'); # external blog
         last if $i++ >= $items;
-        my $feed = Plagger::Feed->new;
-        $feed->type('mixi');
-        (my $subject = $msg->{subject}) =~ s/\(\d+\)$//;
-        (my $link = $msg->{link}) =~ s/&comment_count=\d*//;
-        $feed->title($subject);
-        $feed->description($MAP->{$type}->{title}.': '.$msg->{name});
-        $feed->link($link);
+	my $feed = Plagger::Feed->new;
+	$feed->type('mixi');
+	my $subject = $msg->{subject};
+	$subject =~ s/\([0-9]*\)$//;
+	$feed->title($subject);
+	$feed->description('Mixi: ' . $msg->{name});
+	my $link = $msg->{link};
+	$link =~ s/&comment_count=[0-9]*//;
+	$feed->link($link);
-        $self->add_entry( $context, $type, $feed, $msg );
+        my $entry = Plagger::Entry->new;
+        $entry->title($subject);
+        $entry->link($link);
+        $entry->author($msg->{name});
+        $entry->date( Plagger::Date->parse($format, $msg->{time}) );
+	if ($entry->date) {
+	    $entry->date->set_time_zone('Asia/Tokyo');
+	}
+        if ($self->conf->{show_icon} && !$blocked && defined $MAP->{$type}->{icon}) {
+            my $owner_id = $msg->{link}->query_param($MAP->{$type}->{icon});
+            $context->log(info => "Fetch icon of id=$owner_id");
-        $context->update->add($feed);
-    }
-my $format = DateTime::Format::Strptime->new(pattern => '%Y-%m-%d %H:%M');
-sub parse_date {
-    my ($self, $datetime) = @_;
-    # Calendar doesn't have %H:%M part (spotted by id:mad-capone)
-    return unless defined $datetime;
-    $datetime .= ' 00:00' unless $datetime =~ /\d+:\d+$/;
-    Plagger::Date->parse($format, $datetime);
-sub add_entry {
-    my ($self, $context, $type, $feed, $msg) = @_;
-    if ($type eq 'Log') {
-        $msg->{subject} = $msg->{time} . ' ' . $msg->{name};
-    }
-    my $entry = Plagger::Entry->new;
-    $entry->title($msg->{subject});
-    $entry->link($msg->{link});
-    $entry->author($msg->{name});
-    $entry->date( $self->parse_date($msg->{time}) );
-    $entry->date->set_time_zone('Asia/Tokyo') if $entry->date;
-    if ($self->conf->{show_icon} && !$self->{blocked} && defined $MAP->{$type}->{icon}) {
-        my $owner_id = $msg->{link}->query_param($MAP->{$type}->{icon});
-        $context->log(info => "Fetch icon of id=$owner_id");
-        my $item = $self->cache->get_callback(
-            "outline-$owner_id",
-            sub {
-                Time::HiRes::sleep( $self->conf->{fetch_body_interval} || 1.5 );
-                my $item = $self->{mixi}->show_friend->parse(id => $owner_id)->{outline};
-                $item;
-            },
-            '12 hours',
-        );
-        if ($item && $item->{image} !~ /no_photo/) {
-            # prefer smaller image
-            my $image = $item->{image};
-               $image =~ s/\.jpg$/s.jpg/;
-            $entry->icon({
-                title => $item->{name},
-                url   => $image,
-                link  => $item->{link},
-            });
+            my $item = $self->cache->get_callback(
+                "outline-$owner_id",
+                sub {
+                    Time::HiRes::sleep( $self->conf->{fetch_body_interval} || 1.5 );
+                    my $item = $self->{mixi}->show_friend->parse(id => $owner_id)->{outline};
+                    $item;
+                },
+                '12 hours',
+            );
+            if ($item && $item->{image} !~ /no_photo/) {
+                # prefer smaller image
+                my $image = $item->{image};
+                   $image =~ s/\.jpg$/s.jpg/;
+                $entry->icon({
+                    title => $item->{name},
+                    url   => $image,
+                    link  => $item->{link},
+                });
+            }
-    }
-    my @comments;
-    if ($self->conf->{fetch_body} && !$self->{blocked} && $msg->{link} =~ /view_/ && defined $MAP->{$type}->{get_detail}) {
-        # view_enquete is not implemented and probably
-        # won't be implemented as it seems redirected to
-        # reply_enquete
-        return if $msg->{link} =~ /view_enquete/;
-        $context->log(info => "Fetch body from $msg->{link}");
-        my $item = $self->cache->get_callback(
-            "item-".$msg->{link},
-            sub {
-                Time::HiRes::sleep( $self->conf->{fetch_body_interval} || 1.5 );
-                my $item = $self->{mixi}->parse($msg->{link});
-                $item;
-            },
-            '12 hours',
-        );
-        if ($item) {
-            my $body = $item->{description};
-               $body =~ s!(\r\n?|\n)!<br />!g;
-            for my $image (@{ $item->{images} || [] }) {
-                $body .= qq(<div><a href="$image->{link}"><img src="$image->{thumb_link}" style="border:0" /></a></div>);
-                my $enclosure = Plagger::Enclosure->new;
-                $enclosure->url($image->{thumb_link});
-                $enclosure->auto_set_type;
-                $enclosure->is_inline(1);
-                $entry->add_enclosure($enclosure);
-            }
-            $entry->body($body);
+        my @comments;
+        if ($self->conf->{fetch_body} && !$blocked && $msg->{link} =~ /view_/ && defined $MAP->{$type}->{get_detail}) {
+            # view_enquete is not implemented and probably
+            # won't be implemented as it seems redirected to
+            # reply_enquete
+            next if $msg->{link} =~ /view_enquete/;
+            $context->log(info => "Fetch body from $msg->{link}");
+            my $item = $self->cache->get_callback(
+                "item-".$msg->{link},
+                sub {
+                    Time::HiRes::sleep( $self->conf->{fetch_body_interval} || 1.5 );
+                    my $item = $self->{mixi}->parse($msg->{link});
+                    $item;
+                },
+                '12 hours',
+            );
+            if ($item) {
+                my $body = $item->{description};
+                   $body =~ s!(\r\n?|\n)!<br />!g;
+		if ( $self->conf->{fetch_image} ) {
+		    while ($body =~ /<a href=.*?&#39;(.*?)&#39;.*?<\/a>/) {
+			   my $imagelink = "http://mixi.jp/" . $1;
+			      $imagelink =~ s!&#38;!&!g;
+			   my $pic = $self->{mixi}->parse($imagelink);
+			   if ($pic->{image}) {
+			       my $filename = $pic->{link};
+			       $filename =~ s!.*/!!;
+			       open(OUT, ">" . $self->conf->{fetch_image_path} . "/images/mixi/bbs/" . $filename);
+			       print OUT $pic->{image};
+			       close(OUT);
+			       my $imagepath = "/images/mixi/bbs/$filename";
+			       $body =~ s!<a href=.*?&#39;.*?&#39;.*?</a>!<a href="$imagelink"><img src="$imagepath"/></a>!;
+			   } else {
+			       $body =~ s!<a href=.*?&#39;(.*?)&#39;.*?</a>!<object type="text/html" data="http://mixi.jp/$1">http://mixi.jp/$1</object>!;
+			   }
+		    }
+		} else {
+		    $body =~ s!<a href=.*?&#39;(.*?)&#39;.*?</a>!<object type="text/html" data="http://mixi.jp/$1">http://mixi.jp/$1</object>!g;
+		}
+		$body =~ s!&#38;!&!g;
+                $entry->body($body);
-            $entry->date( $self->parse_date($item->{time}) );
-            $entry->date->set_time_zone('Asia/Tokyo') if $entry->date;
-            if ($self->conf->{fetch_comment}) {
-              for my $comment (@{ $item->{comments} || [] }) {
-                  my $c = Plagger::Entry->new;
-                     $c->title($entry->title . ': '. $comment->{subject});
-                     $c->body($comment->{description});
-                     $c->link($comment->{link});
-                     $c->author($comment->{name});
-                     $c->date( $self->parse_date($comment->{time}) );
-                     $c->date->set_time_zone('Asia/Tokyo') if $c->date;
-                  push @comments, $c;
-              }
+                $entry->date( Plagger::Date->parse($format, $item->{time}) );
+		if ($entry->date) {
+		    $entry->date->set_time_zone('Asia/Tokyo');
+		}
+                if ($self->conf->{fetch_comment}) {
+                  for my $comment (@{ $item->{comments} || [] }) {
+                      my $c = Plagger::Entry->new;
+                         $c->title($subject . ': '. $comment->{subject});
+		      $body = $comment->{description};
+		      if ( $self->conf->{fetch_image} ) {
+			  while ($body =~ /<a href=.*?&#39;(.*?)&#39;.*?<\/a>/) {
+			      my $imagelink = "http://mixi.jp/" . $1;
+			      $imagelink =~ s!&#38;!&!g;
+			      my $pic = $self->{mixi}->parse($imagelink);
+			      if ($pic->{image}) {
+				  my $filename = $pic->{link};
+				  $filename =~ s!.*/!!;
+				  open(OUT, ">" . $self->conf->{fetch_image_path} . "/images/mixi/bbs/" . $filename);
+				  print OUT $pic->{image};
+				  close(OUT);
+				  my $imagepath = "/images/mixi/bbs/$filename";
+				  $body =~ s!<a href=.*?&#39;.*?&#39;.*?</a>!<a href="$imagelink"><img src="$imagepath"/></a>!;
+			      } else {
+				  $self->log(debug => "bbs_comment_picture $1 something wrong");
+				  $body =~ s!<a href=.*?&#39;(.*?)&#39;.*?</a>!<object type="text/html" data="http://mixi.jp/$1">http://mixi.jp/$1</object>!;
+			      }
+			  }
+		      } else {
+			  $body =~ s!<a href=.*?&#39;(.*?)&#39;.*?</a>!<object type="text/html" data="http://mixi.jp/$1">http://mixi.jp/$1</object>!g;
+		      }
+		      $body =~ s!&#38;!&!g;
+                         $c->body($body);
+		      $link = $comment->{link};
+		      $link =~ s/&comment_count=[0-9]*//;
+                         $c->link($link);
+                         $c->author($comment->{name});
+                         $c->date( Plagger::Date->parse($format, $comment->{time}) );
+		      if ($c->date) {
+			  $c->date->set_time_zone('Asia/Tokyo');
+		      }
+                      push @comments, $c;
+                  }
+                }
+            } else {
+                $context->log(warn => "Fetch body failed. You might be blocked?");
+                $blocked++;
-        } else {
-            $context->log(warn => "Fetch body failed. You might be blocked?");
-            $self->{blocked}++;
-    }
-    $feed->add_entry($entry);
-    for my $comment ( @comments ) {
-        $feed->add_entry($comment);
+        $feed->add_entry($entry);
+        for my $comment ( @comments ) {
+            $feed->add_entry($comment);
+        }
+	$context->update->add($feed);
@@ -320,10 +445,6 @@
 With this option set, this plugin fetches users buddy icon from
 mixi.jp site, which makes the output HTML very user-friendly.
-=item split_bbs_feed
-With this option set, BBS feed will be split up. Defaults to 0.
 =item feed_type
 With this option set, you can set the feed types.
      fetch_image: 1
      fetch_image_path: /var/www/fastladder/public
とか書くと、$fetch_image_path/images/mixi に画像が保存される。
本来は、CustomFeed::MixiScraperの段階では Plagger::Entry に Plagger::Enclosureを入れておいて、Store::Fastladder でファイル出力した方が正しい(gmailとかにも添付で送れるはずだし)と思うんだけど、あくまでやっつけなので。(言い訳ばっかりだな)



このブログ記事を参照しているブログ一覧: CustomFeed::MixiScraperでmixiの画像をローカル保存

このブログ記事に対するトラックバックURL: https://www.wizard-limit.net/cgi-bin/mt/mt-tb.cgi/2130



このページは、falseが2009年9月11日 09:58に書いたブログ記事です。

ひとつ前のブログ記事は「fastladder を Passenger で」です。




Powered by Movable Type 6.1.1