#!/usr/bin/perl -w use strict; use Encode; use LWP::UserAgent; use HTML::TableExtract; # ex.1 - myalbum.ne.jp my $target_url = 'http://myalbum.ne.jp/myal/price.html'; my $url_charset = 'Shift_JIS'; my %te_opt = ( depth => 1, count => 3 ); # ex.2 - kakaku.com #my $target_url = 'http://www.kakaku.com/sku/price/005021.htm'; #my $url_charset = 'Shift_JIS'; #my %te_opt = ( depth => 1, count => 8 ); my $output_charset = 'Shift_JIS'; my $content = ''; my $ua = LWP::UserAgent->new( agent => 'Mozilla/5.0', timeout => 15 ); my $response = $ua->get($target_url); if($response->is_success) { $content = $response->content; } else { $content = 'ERROR: ' . $response->status_line; } if( $content =~ /^ERROR/ ) { # error print $content; } else { $content =~ s/ / /ig; Encode::from_to($content, $url_charset => 'EUC-JP'); my $result = ''; my $te = HTML::TableExtract->new(%te_opt); $te->parse($content); if(!%te_opt) { foreach my $ts ($te->tables) { $result .= 'Table (' . join('-', $ts->coords) . "):\n"; foreach my $row ($ts->rows) { $result .= join('|', map { (defined && length) ? $_ : '' } @{$row}) . "\n"; } } } else { foreach my $ts ($te->tables) { $result .= 'Table found at ' . join('-', $ts->coords) . ":\n"; foreach my $row ($ts->rows) { $result .= "\t" . join(' | ', map { (defined && length) or $_ = ''; s/กก/ /g; s/[\r\n\t ]+/ /sg; s/^\s+//g; s/\s+$//g; $_ } @{$row}) . "\n"; } } } Encode::from_to($result, 'EUC-JP' => $output_charset) if($output_charset ne 'EUC-JP'); print $result; }