יום שלישי, אוגוסט 27, 2019

Munigrabber לאתרים שמשתמשים בערכת העיצוב Mivzakiim

לפני מספר שנים בניתי תסריט שעבד על חלק מאתרי המועצות המקומיות. השבוע גיליתי שהקוד הישן שלי כבר לא עובד איתן, כי הן החליפו את ערכת העיצוב.
התסריט הזה מתאים לערכת העיצוב Mivzakiim.
התסריט מתבסס על עיצוב האתר, כך שהוא שוב יישבר כשהן יחליפו את ערכת העיצוב.

הפעלה לדוגמה עבור מועצה מקומית שקר כל שהוא עם הדומיין
https://www.sheker-kolshehu.co.il
:

munigrabber.pl https://www.sheker-kolshehu.co.il

את הקובץ שנוצר שמים במקום שקורא ה-RSS שלנו יכול לקרוא ממנו - למשל בספרייה שממנה Apache מגיש קבצים.


#!/usr/bin/perl  

use strict;
use warnings;

use LWP 5.64; # Loads all important LWP classes, and makes sure the installed version is at least 5.64
use HTTP::Cookies; # Allow work with cookies
use XML::RSS;
use XML::Simple;
use Encode qw(decode encode);
use Data::Dumper;
use DateTime::Format::Strptime qw(strftime );

# The message content is located in a remote page; we fetch its content to
# show it in our RSS feed.
#
# Parameters:
#   $url     - absolute URL of the message page
#   $browser - user-agent object used to issue the GET request
# Returns the HTML of the "WebPartWPQ3" container, or an empty string when the
# page could not be fetched or does not contain that container.
sub get_display_item
{
    my $url = shift;
    my $browser = shift;
    my $response = $browser->get( $url );

    # A failed fetch must not abort the whole feed; the item simply gets an
    # empty description.
    return '' unless $response->is_success;

    my @lines = $response->content =~ /<div.*id="WebPartWPQ3"(.*)<\/div>/g;
    return '' unless @lines;

    # NOTE(review): the published listing lost the text between
    # "$lines[0] = '" and the user-agent constructor (it was swallowed as an
    # HTML tag). Re-wrapping the captured fragment in the <div> it was cut
    # from is a reconstruction - confirm against the author's original.
    return '<div id="WebPartWPQ3"' . $lines[0] . '</div>';
}

die "Usage: $0 <base-url>\n" unless @ARGV;

# NOTE(review): only "new;" of this statement survived in the published
# listing; LWP::UserAgent is presumed because the object is used through
# cookie_jar/agent/get below - confirm.
my $browser = LWP::UserAgent->new;
$browser->cookie_jar( HTTP::Cookies->new(
    'file'     => '/tmp/headers',  # where to read/write cookies
    'autosave' => 1,               # save it to disk when done
));
$browser->agent('Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)');

my $basepath = "$ARGV[0]";
my $url = "$basepath";
my $response = $browser->get( $url );
print "Workding on " . $url . "\n";

# Use the public accessors instead of reaching into the response's private
# hash fields.
if ($response->code != 200 )
{
    die "Error for $url " . $response->content;
}

# For each cookie we got, add it to the current jar.
$browser->cookie_jar->extract_cookies( $response );

my $listing_url = "$basepath/Pages/Mivzakiim.aspx";
$response = $browser->get( $listing_url );
unless ($response->is_success)
{
    die "Error for $listing_url " . $response->status_line;
}

my $rss = XML::RSS->new( version => '2.0');
$rss->channel(
    title       => "Municipal feed for $basepath",
    link        => "$basepath",
    description => "MivzakiimAll RSS feed",
);

my @lines = $response->content =~ /MivzakiimAll generalFilter">(.*)<\/div><\/div>/g;
unless (@lines)
{
    print Dumper (@lines) ;
    die "$url does not have MivzakiimAll content, you need to adjust the script";
}

# Parse only the first match: passing the whole list would make XMLin treat
# any additional matches as option name/value pairs.
my $xml_obj = XMLin($lines[0]);

my $formater = DateTime::Format::Strptime->new(
    pattern   => '%d/%m/%Y',
    time_zone => 'local',
    on_error  => 'croak',
);

# XML::Simple yields a hash (not an array) when the table has a single row.
my $rows = $xml_obj->{tbody}->{tr};
$rows = [ $rows ] if ref($rows) eq 'HASH';

foreach my $tr (@{$rows})
{
    # The first cell holds the date, the second one the linked title.
    my $td = $tr->{td};

    my $date_in_webpage = $td->[0]->{'div'}->{'content'};
    my $dt = $formater->parse_datetime($date_in_webpage);
    my $title = $td->[1]->{'a'}->{'content'};
    my $link = $td->[1]->{'a'}->{'href'};
    my $description = get_display_item ( $basepath . $link , $browser);

    $rss->add_item(
        title       => $title,
        link        => $basepath . $link,
        # RSS 2.0 expects an RFC 822 date string, not a stringified object.
        pubDate     => $dt->strftime('%a, %d %b %Y %H:%M:%S %z'),
        guid        => $basepath . $link,
        description => $description,
    );
}

# Use the basepath for the generated filename, strip http/https from it for
# readability.
my $filename = "$url.rss";
$filename =~ s{https://}{}g;
$filename =~ s{http://}{}g;
$rss->save($filename);

__END__

=head1 NAME

muni RSS - a script to get an RSS from old municipal websites with the
Mivzakiim template which are lacking an RSS/Atom features.

=head1 SYNOPSIS

    munigrabber.pl https://www.sheker-kolshehu.co.il

Create an RSS 2.0 file based on the embedded messages page.

=head1 DESCRIPTION

This script provides a basic example on how one could get the RSS from the
rusty pages without the RSS interface.

=head1 BUGS

=head1 AUTHOR

Original code: Boris Shtrasman

=head1 COPYRIGHT

Copyright (c) 2015 Boris Shtrasman

=head1 LICENSE

This script is free software. You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 CREDITS

    Rael Dornfest
    Jonathan Eisenzopf
    Wojciech Zwiefka
    Chris Nandor
    Shlomi Fish

=head1 SEE ALSO

perl(1), XML::Parser(3), LWP(3), XML::RSS(3), HTTP::Cookies(3)

=cut

אין תגובות:

הוסף רשומת תגובה