#!/usr/bin/perl use strict; use warnings; use utf8; use lib 'lib'; use Encode; use URI; use LWP::UserAgent; use Web::Scraper; use HTML::TreeBuilder::LibXML; use DateTime; use DateTime::Duration; use YAML::Tiny; use XML::Feed; use FindBin; use Net::Twitter; use Data::Dumper; use Unko::Schema; our $DEBUG = 0; if ($DEBUG) { binmode STDOUT, ":utf8"; } else { binmode STDOUT, ":encoding(iso-2022-jp)"; } my $web = (YAML::Tiny->read($FindBin::Bin . '/unko_web.yaml'))->[0]; my $info; my @info; for my $k (@{$web->{'info'}}) { print "area :", $k->{'name'}, "\n" if $DEBUG; print "url :", $k->{'url'}, "\n" if $DEBUG; print "start:", $k->{'start'}, "\n" if $DEBUG; print "end :", $k->{'end'}, "\n" if $DEBUG; $info = &get_traffic_info($k->{'name'}, URI->new($k->{'url'}), $k->{'start'}, $k->{'end'}); push @info, grep { $_->{'update'} = $info->{'update'} } @{$info->{'info'}}; } print Dump \@info if $DEBUG; my $schema = Unko::Schema->connect( 'dbi:SQLite:dbname=/export/home/konishi/perl/unko_ng/db/unkojoho.sqlite', , , { sqlite_unicode => 1, } ); my $rs = $schema->resultset('Unkojoho'); for my $i (grep { not $_->{area} =~ /null/ } @info) { print $i->{'company'}, "\n" if $DEBUG; print $i->{'line'}, "\n" if $DEBUG; print $i->{'status'}, "\n" if $DEBUG; print $i->{'update'}, "\n" if $DEBUG; print $i->{'desc'}, "\n" if $DEBUG; print $i->{'date'}, "\n" if $DEBUG; print $i->{'area'}, "\n" if $DEBUG; print "-----", "\n" if $DEBUG; next unless (exists $i->{'line'}); next unless (defined $i->{'line'}); next unless (exists $i->{'date'}); next unless (defined $i->{'date'}); # next # if ( $i->{'line'} = undef # or $i->{'date'} = undef); # next # if ( $i->{'line'} =~ /^\s*$/ # or $i->{'date'} =~ /^\s*$/); my $entry = $rs->search({line => $i->{'line'}, date => $i->{'date'}}); if ($entry->count == 0) { $rs->create( { line => $i->{'line'}, date => $i->{'date'}, area => $i->{'area'}, message => $i->{'desc'}, company => $i->{'company'}, updated => $i->{'update'}, unkostatus => $i->{'status'}, status => 'got ', } ); } } exit; # scrape sub get_traffic_info { my $area = shift; my $url = shift; my $start = shift; my $end = shift; my $traffic = scraper { process q{//table/tr/td[@class="font01" and text() =~ /JR線、私鉄・地下鉄/}, 'update' => sub { my $date = $_->as_text or return; $date = &guess_year($date)->iso8601; return $date; }; process qq{//table/tr[preceding-sibling::tr/td[text() =~ /$start/] and following-sibling::tr/td[text() =~ /$end/ ] and self::tr[count(td) > 1 ] ] }, 'info[]' => scraper { process '//td[1]', 'company' => 'TEXT'; process '//td[2]', 'line' => 'TEXT'; process '//td[4]', 'status' => 'TEXT'; process '//td[5]', 'desc' => 'TEXT'; process '//td[3]', 'date' => sub { my $date = $_->as_text or return; $date = &guess_year($date)->iso8601; return $date; }; process '//td[1]', 'area' => ['TEXT', sub { return ($_ =~ /^\s*$/) ? "null" : "$area" }]; }; }; my $ua = new LWP::UserAgent(agent => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; ja; rv:1.9.1.1) Gecko/20090715 Firefox/3.5.1' ); $traffic->user_agent($ua); my $info = $traffic->scrape($url); print Dump $info if $DEBUG; return $info; } sub guess_year { my $value = shift; return unless defined $value; $value =~ m!(\d+)[月/](\d+)日?\s+(\d+)[:時](\d+)分?!; my $date = { month => $1, day => $2, hour => $3, minute => $4 }; my $now = DateTime->now(time_zone => 'Asia/Tokyo'); my $this = $now->clone->truncate(to => 'year')->set(%$date); my $last = $this->clone->subtract(years => 1); my $next = $this->clone->add(years => 1); my @date = sort { DateTime::Duration->compare($a->[1], $b->[1], $now) } map { [$_->[0], $_->[1]->is_positive ? $_->[1] : $_->[1]->inverse] } map { [$_, $now - $_] } ($this, $last, $next); $date[0]->[0]; }