[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
www/server/source/linc linc.py
From: Pavel Kharitonov
Subject: www/server/source/linc linc.py
Date: Mon, 15 Feb 2016 10:05:47 +0000
CVSROOT: /web/www
Module name: www
Changes by: Pavel Kharitonov <ineiev> 16/02/15 10:05:47
Modified files:
server/source/linc: linc.py
Log message:
Process HTML redirections like <meta content="0; refresh".
CVSWeb URLs:
http://web.cvs.savannah.gnu.org/viewcvs/www/server/source/linc/linc.py?cvsroot=www&r1=1.28&r2=1.29
Patches:
Index: linc.py
===================================================================
RCS file: /web/www/www/server/source/linc/linc.py,v
retrieving revision 1.28
retrieving revision 1.29
diff -u -b -r1.28 -r1.29
--- linc.py 27 Oct 2015 08:40:06 -0000 1.28
+++ linc.py 15 Feb 2016 10:05:46 -0000 1.29
@@ -20,7 +20,7 @@
from __future__ import print_function
-LINC_VERSION = 'LINC 0.24'
+LINC_VERSION = 'LINC 0.25'
USAGE = \
'''Usage: %prog [options] [BASE_DIRECTORY]
Check links in HTML files from BASE_DIRECTORY.'''
@@ -104,7 +104,7 @@
HTTP_FORWARD_HEADER = HTTP_VERSION_HEADER + '30[01237] '
HTTP_LINK_REGEXP = \
'http(s?)://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?(?P<resource>/[^#]*)?'
-HTTP_NEW_LOCATION_HEADER = '(^|\r\n)Location: (?P<new_location>.+)(\r\n|$)'
+HTTP_NEW_LOCATION_HEADER = '(?i)(^|\r\n)Location: (?P<new_location>.+)(\r\n|$)'
LINK_BEGIN = '(?i)(<a\s[^<]*)'
# We want to parse links like href="URL" as well as href='URL';
# I failed to compose a single regexp for that -- ineiev.
@@ -113,6 +113,10 @@
"(?is)^<a(\s.+?)?\shref='(?P<link>[^']+)'(\s.+?)?>"]
TRANSLATION_REGEXP = '\.(?P<langcode>[a-z]{2}|[a-z]{2}-[a-z]{2})\.[^.]+$'
SYMLINK_REGEXP='^\s*(?P<to>[^\s]*)\s+(?P<from>[^\s]*).*$'
+EOH_MARK = '\r\n\r\n'
+REFRESH_REGEXP = '''(?is)^\s*<meta\s[^>]*http-equiv=['"]refresh['"]'''
+REFRESH_URL = \
+'''(?is)^\s*<meta\s.*content=['"]\s*0[;,][^'">].*url=(?P<new_location>[^>"']+)['"]'''
# Don't report against commented out link to README.translations.html
LINK_TO_SKIP = '/server/standards/README.translations.html'
@@ -173,9 +177,19 @@
def get_http_link_error(link, link_type, forwarded_from = None):
if forwarded_from == None:
forwarded_from = []
+ if len(forwarded_from) >= FORWARDS_TO_FOLLOW:
+ if VERBOSE > 2:
+ print ('too many forwards:')
+ print (forwarded_from)
+ return 'too many forwards (over ' \
+ + str(FORWARDS_TO_FOLLOW) + ')'
+ if link in forwarded_from:
+ return 'forward loop!'
+ forwarded_from.append(link)
connection_data = re.search (HTTP_LINK_REGEXP, link)
if not connection_data:
+ report (2, 'No connection data in ' + link)
return None
hostname = connection_data.group ('hostname')
port = connection_data.group ('port')
@@ -185,6 +199,7 @@
# if a socket couldn't be created,
# just ignore this link this time.
if socketfd == None:
+ report (2, 'No socket for ' + link)
return None
if port == None:
@@ -219,7 +234,7 @@
if webpage == None:
return 'couldn\'t read from socket'
- end_of_headers = webpage.find('\r\n\r\n')
+ end_of_headers = webpage.find(EOH_MARK)
if end_of_headers == -1:
report(1, 'No end of headers found on webpage (link ' \
+ link + ')')
@@ -230,6 +245,7 @@
+ 'headers (possibly no content in file)'
header = webpage[:end_of_headers]
+ page = webpage[end_of_headers + len(EOH_MARK):]
verb_level = 5
if not re.search (HTTP_VERSION_HEADER, header):
report (1, 'No HTTP version found in header')
@@ -244,13 +260,18 @@
match = re.search (HTTP_FORWARD_HEADER, header)
if not match:
+ match = re.search (REFRESH_REGEXP, page)
+ if not match:
return None
- if len(forwarded_from) >= FORWARDS_TO_FOLLOW:
- if VERBOSE > 2:
- print ('too many forwards:')
- print (forwarded_from)
- return 'too many forwards (over ' \
- + str(FORWARDS_TO_FOLLOW) + ')'
+ match = re.search (REFRESH_URL, page)
+ if not match:
+ return None
+ new_location = match.group('new_location')
+ [link_type, url] = classify_link("", new_location)
+ if url == link:
+ # Refresh to the same URL.
+ return None
+ else:
match = re.search(HTTP_NEW_LOCATION_HEADER, header)
if not match:
report(-2, 'Forwarded location not found')
@@ -261,12 +282,9 @@
print ('Aborting due to bad forward.')
exit(1)
return None
- forwarded_from.append(link)
new_location = match.group('new_location')
[link_type, url] = classify_link("", new_location)
- if new_location in forwarded_from:
- return 'forward loop!'
- return get_http_link_error(new_location, link_type, forwarded_from)
+ return get_http_link_error(url, link_type, forwarded_from)
def is_inside_comment(head):
start = head.rfind('<!--')
@@ -776,8 +794,8 @@
if err != None:
broken_so_far = broken_so_far + 1
if (VERBOSE > 1 and err != None) or VERBOSE > 2:
- print('link ' + str(i) + ': ' + link \
- + ': ' + (err if err else '(no error)'))
+ print(('link ' + str(i) + ': ' + link \
+ + ': ' + (err if err else '(no error)')).encode('iso-8859-1'))
report(-2, '\n' + str (len (links_to_check)) + ' links, ' \
+ unique_links + ' unique, ' \
+ str (cached_links) + ' cached, ' \
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- www/server/source/linc linc.py,
Pavel Kharitonov <=