[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
www/server/source/planetrss planetrss.pl
From: |
Pavel Kharitonov |
Subject: |
www/server/source/planetrss planetrss.pl |
Date: |
Mon, 09 Dec 2013 04:33:18 +0000 |
CVSROOT: /web/www
Module name: www
Changes by: Pavel Kharitonov <ineiev> 13/12/09 04:33:18
Modified files:
server/source/planetrss: planetrss.pl
Log message:
Assume all removed tags may have attributes;
output the result to stdout; remove links by default;
handle some special cases of description.
CVSWeb URLs:
http://web.cvs.savannah.gnu.org/viewcvs/www/server/source/planetrss/planetrss.pl?cvsroot=www&r1=1.16&r2=1.17
Patches:
Index: planetrss.pl
===================================================================
RCS file: /web/www/www/server/source/planetrss/planetrss.pl,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -b -r1.16 -r1.17
--- planetrss.pl 22 May 2011 15:29:55 -0000 1.16
+++ planetrss.pl 9 Dec 2013 04:33:17 -0000 1.17
@@ -1,327 +1,257 @@
- # PlanetRSS, Version 1.3
- # Copyright © 2011 Shailesh Ghadge
+# PlanetRSS: fetch feeds from planetgnu.org and output them as HTML.
+#
+# Copyright © 2011 Shailesh Ghadge
+# Copyright © 2013 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# Email: address@hidden
+
+use strict;
+
+# Provides simple pure perl RSS parsing.
+use XML::RSS::Parser::Lite;
+
+# Provides get(url) function.
+use LWP::Simple;
+
+# Provides argument handling.
+use Getopt::Long;
+
+my $Version = "1.4";
+my $default_lines = 3;
+my $default_length = 200;
+my $head = "<!-- Autogenerated by planetrss.pl ".$Version." -->\n";
+
+# Number of feeds to output.
+my $FeedLines = $default_lines;
+
+# Number of characters per feed.
+my $FeedLength = $default_length;
+
+# Other options.
+my $a = 0;
+my $b = 0;
+my $code = 0;
+my $div = 0;
+my $em = 0;
+my $h = 0;
+my $hr = 0;
+my $i = 0;
+my $img = 0;
+my $p = 0;
+my $pre = 0;
+my $strong = 0;
+my $table = 0;
+my $textarea = 0;
+my $tt = 0;
+my $ul = 0;
+my $help;
+my $version;
+
+sub strip_tag
+{
+ my $str = shift;
+ my $tag = shift;
+ my $repl = shift;
+
+ $str =~ s/<$tag(.*?)>/$repl/gi;
+ $str =~ s/<\/$tag>//gi;
+
+ return $str;
+}
+
+my $feeds = get ("http://planet.gnu.org/rss20.xml");
+
+GetOptions ("a=i" => \$a, "b=i" => \$b, "code=i" => \$code, "div=i" => \$div,
+ "em=i" => \$em, "h=i" => \$h, "hr=i" => \$hr, "i=i" => \$i,
+ "img=i" => \$img, "p=i" => \$p, "pre=i" => \$pre,
+ "strong=i" => \$strong, "table=i" => \$table,
+ "textarea=i" => \$textarea, "tt=i" => \$tt, "ul=i" => \$ul,
+ "FeedLines=i" => \$FeedLines, "FeedLength=i" => \$FeedLength,
+ "help" => \$help, "version" => \$version);
+
+if ($help)
+ {
+ print "Usage: perl planetrss.pl [options]
+
+Options:
+
+ -FeedLines=n The number of feeds to output
+ -FeedLength=m The length of feed
+
+ -a=1 preserve a href tags
+ -b=1 preserve b tags
+ -code=1 preserve code tags
+ -div=1 preserve div tags
+ -em=1 preserve em tags
+ -h=1 preserve h tags
+ -hr=1 preserve hr tags
+ -i=1 preserve i tags
+ -img=1 preserve img tags
+ -p=1 preserve p tags
+ -pre=1 preserve pre tags
+ -strong=1 preserve strong tags
+ -table=1 preserve table, tr, th tags
+ -textarea=1 preserve textarea tags
+ -tt=1 preserve tt tags
+ -ul=1 preserve ul, ol, dl, li tags
- #This program is free software: you can redistribute it and/or modify
- #it under the terms of the GNU General Public License as published by
- #the Free Software Foundation, either version 3 of the License, or
- #(at your option) any later version.
- #
- #This program is distributed in the hope that it will be useful,
- #but WITHOUT ANY WARRANTY; without even the implied warranty of
- #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- #GNU General Public License for more details.
- #
- #You should have received a copy of the GNU General Public License
- #along with this program. If not, see <http://www.gnu.org/licenses/>.
-
- #Email: address@hidden #Version Date: 27 Apr 2011
- #
- #Functionality: Compare with previously retrieved feeds(if any) and
then if required,
- # Fetch & save 'n' feeds from planet.gnu.org using RSS
feed link http://planet.gnu.org/rss20.xml in html format
- # Each feed is truncated to 'm' characters.
- # User control over:-
- # 1.retaining/removal of html tags
- # 2.number of Feeds & Feed length
- # 3.output path
- # 4.forced write
-
-
- #--------------------------------------------
- use XML::RSS::Parser::Lite;
- #Provides simple pure perl RSS parsing
-
- use LWP::Simple;
- #Provides get(url) function
-
- use Getopt::Long;
- #Provides arguement handling
- #---------------------------------------------
- my $FeedLines = 3; # 'n' feeds
- my $FeedLength = 200; # 'm' characters
-
- my $PGfeeds = get("http://planet.gnu.org/rss20.xml");
- #Fetch RSS feeds as xml
-
- #Options for Sanitization (value 1 implies tag will not be stripped,
any other value implies tag will be stripped)
- my $a = 1; my $b = 0; my $code = 0; my $div = 0; my $em = 0; my $h =
0; my $hr = 0; my $i = 0; my $img = 0; my $p = 0; my $pre = 0; my $strong = 0;
- my $table = 0; my $textarea = 0; my $tt = 0; my $ul = 0;
-
- #Options
- my $help; my $version; my $PGpath = "planetfeeds.html"; my $f;
-
- #Set values as per agruements
- GetOptions("a=i" => \$a, "b=i" => \$b, "code=i" => \$code, "div=i" =>
\$div, "em=i" => \$em, "h=i" => \$h, "hr=i" => \$hr, "i=i" => \$i,"img=i" =>
\$img,
- "p=i" => \$p, "pre=i" => \$pre, "strong=i" => \$strong,
"table=i" => \$table, "textarea=i" => \$textarea, "tt=i" => \$tt, "ul=i" =>
\$ul,
- "FeedLines=i" => \$FeedLines, "FeedLength=i" =>
\$FeedLength, "help" => \$help, "version" => \$version, "path=s" => \$PGpath,
- "f" => \$f);
+ -help Display help and exit
+ -version Display version and exit
- #------------------------------Help-------------------------------
- if($help)
- {
- print "
-
-Usage: perl planetrss.pl [-options]
-------------------------------------------------------------------------------
Defaults:
- Number of Feeds= 3,
- Length of Feed = 200,
- except 'a' tag, all above tags are removed.
-
-Feed control options:
- -FeedLines=n 'n' is the number of Feeds
- -FeedLength=m 'm' is the length of Feed
-
-Force Write:
- -f Overwrites existing outputfile (even if the
latest feed from RSS & First feed of Previous outputfile is same)
-
-Help:
- -help
-
-Output Path:
- -path=\"/path\" Set the output path,
- eg: -path=\"/www/planetfeeds.html\"
- -path=\"../www/\"
- -path=\"../www\"
-
-Tag preserve options:
- -a=1 a href tag will not be removed
- -b=1 b tag will not be removed
- -code=1 code tag will not be removed
- -div=1 div tag will not be removed
- -em=1 em tag will not be removed
- -h=1 h tag will not be removed
- -hr=1 hr tag will not be removed
- -i=1 tag will not be removed
- -img=1 img tag will not be removed
- -p=1 p tag will not be removed
- -pre=1 pre tag will not be removed
- -strong=1 strong tag will not be removed
- -table=1 table,tr,th tags will not be removed
- -textarea=1 textarea tag will not be removed
- -tt=1 tt tag will not be removed
- -ul=1 ul li tags will not be removed
-
-Tag removal options:
- Syntax same as in 'tag perserve'.
- Set value to 0 or any number other than 1.
-
-Version Info:
- -version
-------------------------------------------------------------------------------
-Some Examples:
- perl planetrss.pl -f -FeedLines=7 -FeedLength=500
-path=\"../www/planetfeeds.html\"
+ -FeedLines=".$default_lines." -FeedLength=".$default_length."
+
+Examples:
+ perl planetrss.pl -FeedLines=7 -FeedLength=500
perl planetrss.pl -version
perl planetrss.pl -help
- perl planetrss.pl -i=1 -hr=1 -a=0
+ perl planetrss.pl -i=1 -hr=1 -a=1
-\n";
+";
exit;
}
- #-------------------------------End Help-----------------------------
- #-------------------------------Version------------------------------
- if($version)
+if ($version)
{
- print "
-------------------------------------------------------------------
- PlanetRSS, Version 1.3
- Copyright © 2011 Shailesh Ghadge
- License: GPLv3 Contact: address@hidden
- Version: 1.3 Version released on: 27 April 2011
-------------------------------------------------------------------\n";
+ print "PlanetRSS ".$Version."
+Copyright (C) 2011 Shailesh Ghadge
+Copyright (C) 2013 Free Software Foundation, Inc.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+";
exit;
}
- #------------------------------End Version--------------------------
+my $PGparser = new XML::RSS::Parser::Lite;
- my $PGparser = new XML::RSS::Parser::Lite;
- #Create new RSS parser
+$PGparser->parse ($feeds);
- $PGparser->parse($PGfeeds);
- #To Parse the supplied xml
+print $head;
- #-----------------------------Path Check----------------------------
- if(-d $PGpath)
- {
- if(substr($PGpath,length($PGpath)-1) eq "/")
- {
- $PGpath=$PGpath."planetfeeds.html";
- }
- else
+for (my $i = 0; $i < $FeedLines; $i++)
{
- $PGpath=$PGpath."\/planetfeeds.html";
- }
- }
- #-------------------------------------------------------------------
- my $Write2File = 1; #Default: We write to PlanetFeeds.html;
- #--------------------------------Check------------------------------
- if(!$f) # If force write flag is set, then no need to check
- {
- my $CompareFeeds = 1;
- #open (CurPGhtml, 'planetfeeds.html') || $CompareFeeds--;
- open (CurPGhtml, $PGpath) || $CompareFeeds--;
- if($CompareFeeds == 1)
- {
- my @Cur_Content = <CurPGhtml>;
- my $Cur_Title = $Cur_Content[0];
- $Cur_Title=~ s/<(.*?)>//gi; $Cur_Title=~
s/<a(.*?)>//gi; $Cur_Title=~ s/<\/a>//gi;
- $Cur_Title=~ s/<p>//gi; $Cur_Title=~
s/<\/p>//gi; $Cur_Title=~ s/<li>//gi;
- $Cur_Title=~ s/<ul>//gi; $Cur_Title=~ s/<br
\/>//gi; $Cur_Title =~ s/\s\s+/ /g;
- $Cur_Title=substr($Cur_Title,0,index($Cur_Title,':'));
- #Now we have Current Title
-
- my $New_Checker = $PGparser->get(0);
- my $New_Title = $New_Checker->get('title');
- $New_Title = substr($New_Title,index($New_Title,':')+2);
- #Now we have New Title
+ my $feed = $PGparser->get ($i);
+ my $url = $feed->get ('url');
+ my $title = $feed->get ('title');
+ my $desc = $feed->get ('description');
+ my $tail = "...\n<a href='".$url."'>more</a></p>\n";
+ my $null_tail = "</p>";
+ my $null_head = "";
+ my $d0;
+ $head = ":\n";
- if($Cur_Title eq $New_Title)
- {
- $Write2File=0;
- }
- #Decide whether to continue & write PGhtml
- }
- close(CurPGhtml);
- }
- #-------------------------End of Check-----------------------------
-
- #print "content-type: text/html \n";
- # Use above if you get errors regarding headers
+ # Remove Blog name.
+ $title = substr ($title, index ($title, ':') + 2);
- #---------------------To Create/Overwrite PlanetFeeds.html-----------
- if($Write2File==1)
- {
- my $PGhead= "<!-- Autogenerated File by planetrss.pl
http://web.cvs.savannah.gnu.org/viewvc/www/server/source/planetrss/?root=www
-->";
- #open (PGhtml, '>planetfeeds.html');
- open (PGhtml, '>'.$PGpath);
- print PGhtml $PGhead;
- #Print Feeds data in the format of- "Title - Description... <a
href='URL'>more</a>"
- for (my $i = 0; $i < $FeedLines; $i++)
- {
- my $PGfeed = $PGparser->get($i);
- my $PGurl = $PGfeed->get('url');
- my $PGtitle = $PGfeed->get('title');
- my $PGdesc = $PGfeed->get('description');
+ # Check whether there are at least 12 characters for description.
+ $desc = "" unless length ($title) < ($FeedLength - 23);
- $PGtitle = substr($PGtitle,index($PGtitle,':')+2);
#Remove Blog name
-
- #constraint: atleast 12 characters space for displaying
description
- if(length($PGfeed->get('title')) >= ($FeedLength-23))
- {
- print PGhtml "<p><a
href='".$PGurl."'>".substr($PGtitle, 0, $FeedLength) ."</a></p>\n";
- }
- else
- {
- #Sanitize Description
- #$PGdesc=~ s/<(.*?)>//gi;
- $PGdesc=~ s/<br \/>//gi; $PGdesc
=~ s/\s\s+/ /g;#remove whitespace
- if($a!=1)
- {
- $PGdesc=~ s/<a(.*?)>//gi;
$PGdesc=~ s/<\/a>//gi;
- }
- if($b!=1)
- {
- $PGdesc=~ s/<b>//gi;
$PGdesc=~ s/<\/b>//gi;
- }
- if($code!=1)
- {
- $PGdesc=~ s/<code>//gi;
$PGdesc=~ s/<\/code>//gi;
- }
- if($div!=1)
+ # Sanitize description.
+ $desc =~ s/<br \/>//gi; $desc =~ s/\s\s+/ /g;
+ $desc = strip_tag ($desc, "a") unless $a == 1;
+ $desc = strip_tag ($desc, "b") unless $b == 1;
+ $desc = strip_tag ($desc, "code") unless $code == 1;
+ $desc = strip_tag ($desc, "div") unless $div == 1;
+ $desc = strip_tag ($desc, "em") unless $em == 1;
+ $desc = strip_tag ($desc, "i") unless $i == 1;
+ $desc = strip_tag ($desc, "img") unless $img == 1;
+ $desc = strip_tag ($desc, "h\d") unless $h == 1;
+ $desc = strip_tag ($desc, "hr") unless $hr == 1;
+ $desc = strip_tag ($desc, "p") unless $p == 1;
+ $desc = strip_tag ($desc, "pre") unless $pre == 1;
+ $desc = strip_tag ($desc, "strong") unless $strong == 1;
+ if ($table != 1)
{
- $PGdesc=~ s/<div(.*?)>//gi;
$PGdesc=~ s/<\/div>//gi;
+ $desc = strip_tag ($desc, "table");
+ $desc = strip_tag ($desc, "t[hrd]");
}
- if($em!=1)
+ if ($textarea != 1)
{
- $PGdesc=~ s/<em>//gi;
$PGdesc=~ s/<\/em>//gi;
+ $desc = strip_tag ($desc, "textarea");
+ $desc =~ s/<textarea(.*?)>//gi; $desc=~ s/<\/textarea>//gi;
}
- if($i!=1)
+ $desc = strip_tag ($desc, "tt") unless $tt == 1;
+ if ($ul != 1)
{
- $PGdesc=~ s/<i>//gi;
$PGdesc=~ s/<\/i>//gi;
+ $desc = strip_tag ($desc, "[duo]l");
+ $desc = strip_tag ($desc, "li", " * ");
}
+ $desc =~ s/</</gi; $desc =~ s/>/>/gi;
+ $desc =~ s/&lt;/</gi; $desc =~ s/&gt;/>/gi;
+ $desc =~ s/"/"/gi;
+ $desc =~ s/\s\s+/ /g;
- if($img!=1)
- {
- $PGdesc=~ s/<img(.*?)>//gi;
$PGdesc=~ s/<\/img>//gi;
- }
- if($h!=1)
- {
- $PGdesc=~ s/<h(.*?)>//gi;
$PGdesc=~ s/<\/h(.*?)>//gi;
- }
+ # Empty description: no "more", no ":".
+ $tail = $null_tail unless length ($desc);
+ $head = "" unless length ($desc);
- if($hr!=1)
- {
- $PGdesc=~ s/<hr>//gi;
$PGdesc=~ s/<\/hr>//gi;
- }
- if($p!=1)
- {
- $PGdesc=~ s/<p>//gi;
$PGdesc=~ s/<\/p>//gi;
- }
- if($pre!=1)
- {
- $PGdesc=~ s/<pre>//gi;
$PGdesc=~ s/<\/pre>//gi;
- }
- if($strong!=1)
- {
- $PGdesc=~ s/<strong>//gi;
$PGdesc=~ s/<\/strong>//gi;
- }
- if($table!=1)
- {
- $PGdesc=~ s/<table(.*?)>//gi;
$PGdesc=~ s/<\/table>//gi;
- $PGdesc=~ s/<tr(.*?)>//gi;
$PGdesc=~ s/<\/tr>//gi;
- $PGdesc=~ s/<th(.*?)>//gi;
$PGdesc=~ s/<\/th>//gi;
- }
- if($textarea!=1)
- {
- $PGdesc=~ s/<textarea(.*?)>//gi;
$PGdesc=~ s/<\/textarea>//gi;
- $PGdesc=~ s/<textarea(.*?)>//gi;
$PGdesc=~ s/<\/textarea>//gi;
- }
- if($tt!=1)
- {
- $PGdesc=~ s/<tt>//gi;
$PGdesc=~ s/<\/tt>//gi;
- }
- if($ul!=1)
- {
- $PGdesc=~ s/<ul(.*?)>//gi;
$PGdesc=~ s/<\/ul>//gi;
- $PGdesc=~ s/<li(.*?)>//gi;
$PGdesc=~ s/<\/li>//gi;
- }
- $PGdesc=~ s/</</gi; $PGdesc=~ s/>/>/gi;
$PGdesc=~ s/&lt;/</gi; $PGdesc=~ s/&gt;/>/gi;
- $PGdesc=~ s/"/"/gi;
- #------End of Sanitization
+ $d0 = $desc;
- #Predict & resolve 'a' tag breaking
- if($a==1) # If a tags are included
+ if($a == 1) # a tags are included
{
- $PGdesc_front = substr($PGdesc,
0,($FeedLength-(10+length($PGtitle))));
- $PGdesc_rear = substr($PGdesc,
($FeedLength-(10+length($PGtitle))));
-
if(substr($PGdesc_front,($FeedLength-(10+length($PGtitle))-1)) eq "<") #Fix for
line cut at '<'
+ # Will this work when other tags are included?
+ my $start_a;
+ my $end_a;
+ my $front = substr($desc, 0,($FeedLength-(10+length($title))));
+ my $rear = substr($desc, ($FeedLength-(10+length($title))));
+
+ if(substr($front,($FeedLength-(10+length($title))-1)) eq "<")
{
- $PGdesc_front = substr($PGdesc,
0,($FeedLength-(10+length($PGtitle)))+1);
- $PGdesc_rear = substr($PGdesc,
($FeedLength-(10+length($PGtitle)))+1);
+ # Avoid line cut at '<'.
+ $front = substr($desc, 0,($FeedLength-(10+length($title)))+1);
+ $rear = substr($desc, ($FeedLength-(10+length($title)))+1);
}
- while ($PGdesc_front =~ /<a/gi) {
$start_a++ }
- while ($PGdesc_front =~ /<\/a>/gi) {
$end_a++ }
+ while ($front =~ /<a/gi) { $start_a++ }
+ while ($front =~ /<\/a>/gi) { $end_a++ }
- if($start_a != $end_a)
+ $front = $front.substr($rear,0,index($rear,'</a>') + 4)
+ unless $start_a == $end_a;
+ $desc = $front;
+ }
+ else # a tags are removed
{
- $PGdesc_front =
$PGdesc_front.substr($PGdesc_rear,0,index($PGdesc_rear,'</a>')+4);
+ $desc = substr($desc, 0, ($FeedLength-(10+length($title))));
}
- $PGdesc=$PGdesc_front;
- }
- else # If a tags are removed
+ # If we print whole description, there is no "more".
+ $tail = $null_tail unless $d0 ne $desc;
+
+ # Kill trailing spaces and possible start of tag.
+ $desc =~ s/[<\s]*$//;
+
+ if ($tail ne $null_tail)
{
- #Truncate Description
- $PGdesc = substr($PGdesc,
0,($FeedLength-(10+length($PGtitle)))); #10 characters removed for ': ' and
'... more'
+ # Kill ending commas.
+ $desc =~ s/[.,:;]*$//;
+ # Produce "!.." and "?.." rather than "!..." and "?...".
+ $tail =~ s/.// if $desc =~ /[!?]$/;
}
- #Output
- print PGhtml "<p><a
href='".$PGurl."'>".$PGtitle ."</a>: ".$PGdesc. "... <a
href='".$PGurl."'>more</a></p>\n";
- }
+ print "<p><a href='".$url."'>\n".$title ."</a>".$head.$desc.$tail;
}
- close(PGhtml);
- }
- #------------------------------End of To Create/Overwrite
PlanetFeeds.html-------------------------
-
-
- www/server/source/planetrss planetrss.pl,
Pavel Kharitonov <=