[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
www/server/source/linc linc.py
From: Waclaw Jacek
Subject: www/server/source/linc linc.py
Date: Sun, 18 Sep 2011 21:36:18 +0000
CVSROOT: /web/www
Module name: www
Changes by: Waclaw Jacek <wwj> 11/09/18 21:36:18
Modified files:
server/source/linc: linc.py
Log message:
Updating source code of LINC
CVSWeb URLs:
http://web.cvs.savannah.gnu.org/viewcvs/www/server/source/linc/linc.py?cvsroot=www&r1=1.1&r2=1.2
Patches:
Index: linc.py
===================================================================
RCS file: /web/www/www/server/source/linc/linc.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -b -r1.1 -r1.2
--- linc.py 14 Mar 2011 18:44:23 -0000 1.1
+++ linc.py 18 Sep 2011 21:34:56 -0000 1.2
@@ -15,28 +15,32 @@
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-# ---------------------------------------------------------------------
-#
-# Command format: "python checklinks.py <local_directory_containing_website>
<corresponding_address_on_web_server>"
# defines
+BASE_DIRECTORY = '/home/w/wwj/www-repo/'
+REMOTE_BASE_DIRECTORY = 'http://www.gnu.org/'
+
+ADDITIONAL_HTTP_HEADERS = 'User-Agent: LINC/alpha\r\nAccept: text/html,
text/plain, audio/mod, image/*, application/msword, application/pdf,
application/postscript, text/sgml, video/mpeg, */*;q=0.01\r\n' # end every
header with "\r\n"
DELAY_BETWEEN_CHECKS = 1 # In seconds. Set to 0 to disable delay between
checks of different links.
DELAY_BETWEEN_RETRIES = 10 # In seconds. Used when a link fails before
re-checking it. Set to 0 to disable delays.
-NUMBER_OF_ATTEMPTS = 3 # Set to 1 for the program to check links just once,
without retrying in case of failure. 0 makes the program not even check the
link at all, so it is not the most recommended of values.
+FORWARDS_TO_FOLLOW = 5 # How many forwards should be followed.
+NUMBER_OF_ATTEMPTS = 3 # Number of times to check a link for error. If an
attempt is successful, the link is no longer checked during that program run.
REPORT_FILE_NAME = 'reports-temp/broken_links' # Path to the file to which the
errors will be reported.
SEPARATE_TRANSLATION_REPORTS = True # If you set this to True, reports for
translations will be saved into "REPORT_FILE_NAME.lang" instead of in the main
report file.
SKIP_TRANSLATION_FILES = False # If you set this to True, files with
translations will be skipped.
SOCKET_TIMEOUT = 20 # After what time to give up with trying to retrieve a
website.
-EXCLUDED_DIRECTORIES_REGEXP = '(wwwes$|wwwin$|education/fr|software/[^/]+$)' #
Matching directories will not be entered to check their files or subdirectories.
-EXCLUDED_FILENAMES_REGEXP = '\.po$' # Matching files will be ignored.
+# regexp-related defines
+
+EXCLUDED_DIRECTORIES_REGEXP =
'^(japan|wwwes|wwwin|education/fr|software/[^/]+)$' # Matching directories will
not be entered to check their files or subdirectories.
+FILENAMES_TO_CHECK_REGEXP = '\.html$' # Only matching files will be checked.
FTP_LINK_REGEXP = 'ftp://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?'
HTTP_ERROR_HEADER = '^HTTP/1\.1 (?P<http_error_code>403|404) ' # What to treat
as a HTTP error header.
+HTTP_FORWARD_HEADER = '^HTTP/1\.1 (301 Moved Permanently|302 Found)$'
HTTP_LINK_REGEXP =
'http://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?(?P<resource>/[^#]*)?'
-LINK_REGEXP = '<a( .+?)? href="([^mailto:].+?)"( .+?)?>'
-LINK_REGEXP_GROUP = 1 # Number of the group that matters in the above regexp.
+HTTP_NEW_LOCATION_HEADER = '^Location: (?P<new_location>.+)$'
+LINK_REGEXP = '<a( .+?)? href="(?P<link>[^mailto:].+?)"( .+?)?>'
TRANSLATION_REGEXP = '\.(?P<langcode>[a-z]{2}|[a-z]{2}-[a-z]{2})\.[^.]+$'
# libraries
@@ -76,7 +80,10 @@
socketfd.close()
return None
-def get_http_link_error( link ):
+def get_http_link_error( link, forwarded_from = None ): # forwarded_from is
either None or a list
+ if forwarded_from == None:
+ forwarded_from = []
+
connection_data = re.search( HTTP_LINK_REGEXP, link )
if connection_data:
hostname = connection_data.group( 'hostname' )
@@ -97,7 +104,7 @@
if resource == None:
resource = '/'
- socketfd.send( 'GET ' + resource + ' HTTP/1.1\r\nHost: ' +
hostname + '\r\n\r\n' )
+ socketfd.send( 'GET ' + resource + ' HTTP/1.1\r\nHost: ' +
hostname + '\r\n' + ADDITIONAL_HTTP_HEADERS + '\r\n' )
webpage = socket_read( socketfd )
if webpage == None:
@@ -112,11 +119,44 @@
header_lines = webpage[ : end_of_headers_pos ]
header_lines = header_lines.split( '\r\n' )
- for header_line in header_lines:
- match = re.search( HTTP_ERROR_HEADER, header_line )
+
+ # search for errors
+ match = regexp_search_list( HTTP_ERROR_HEADER, header_lines )
if match:
http_error_code = match.group(
'http_error_code' )
return 'http error ' + http_error_code + '
returned by server'
+
+ # look for forwards
+ match = regexp_search_list( HTTP_FORWARD_HEADER, header_lines )
+ if match:
+ if len( forwarded_from ) < FORWARDS_TO_FOLLOW: # if we
haven't been forwarded too many times yet...
+ match = regexp_search_list(
HTTP_NEW_LOCATION_HEADER, header_lines )
+ if match:
+ forwarded_from.append( link )
+ new_location = match.group(
'new_location' )
+ if new_location in forwarded_from:
+ return 'forward loop!'
+ else:
+ return get_http_link_error(
new_location, forwarded_from )
+ else: # we've been forwarded too many times, sorry.
+ return 'too many forwards (over ' + str( len(
forwarded_from ) ) + ')'
+
+ return None
+
+def is_match_inside_comment_block( regexp_match ):
+ haystack = regexp_match.string
+ match_pos = regexp_match.start()
+ comment_block_start = haystack.rfind( '<!--', 0, match_pos )
+ comment_block_end = haystack.rfind( '-->', 0, match_pos )
+ if comment_block_start > comment_block_end:
+ return True
+ return False
+
+def regexp_search_list( regexp, the_list ):
+ for list_element in the_list:
+ match = re.search( regexp, list_element )
+ if match:
+ return match
return None
def search_directory_for_files( base_directory, directory ):
@@ -129,7 +169,7 @@
search_directory_for_files( base_directory,
relative_path_to_element )
else:
- if re.search( EXCLUDED_FILENAMES_REGEXP, element_name ):
+ if not re.search( FILENAMES_TO_CHECK_REGEXP,
element_name ):
continue
if ( SKIP_TRANSLATION_FILES == True ) and re.search(
TRANSLATION_REGEXP, element_name ):
@@ -162,13 +202,8 @@
### OK, main program below.
-# check if a path has been provided
-if len( sys.argv ) < 3:
- print 'Please run the program with the following arguments: the
directory which should be checked and its corresponding URL on the live site
(so eg.: ./linc.py gnucvs/software/ http://www.gnu.org/software/)'
- sys.exit( 1 )
-
-base_directory = sys.argv[1]
-remote_base_directory = sys.argv[2]
+base_directory = BASE_DIRECTORY
+remote_base_directory = REMOTE_BASE_DIRECTORY
if remote_base_directory[-1] != '/':
remote_base_directory += '/'
@@ -176,7 +211,7 @@
# `cd` to this path
if not os.path.isdir( base_directory ):
- print 'Selected path isn\'t a directory (or doesn\'t exist at all).'
+ print 'Entered base directory isn\'t a directory (or doesn\'t exist at
all).'
sys.exit( 1 )
# list files
@@ -192,8 +227,8 @@
file_contents = fd.read()
fd.close()
- for match in re.findall( LINK_REGEXP, file_contents ):
- link = match[LINK_REGEXP_GROUP]
+ for match in re.finditer( LINK_REGEXP, file_contents ):
+ link = match.group( 'link' )
line_number = -1
split_file_contents = file_contents.split( '\n' )
@@ -203,7 +238,7 @@
line_number = checked_line_number
break
- link_container = { 'filename': file_to_check, 'line_number':
line_number, 'link': link }
+ link_container = { 'filename': file_to_check, 'line_number':
line_number, 'link': link, 'is_link_inside_comment_block':
is_match_inside_comment_block( match ) }
links_to_check.append( link_container )
# check links
@@ -223,6 +258,7 @@
filename = link_container['filename']
line_number = link_container['line_number']
link = link_container['link']
+ is_link_inside_comment_block =
link_container['is_link_inside_comment_block']
link_type = None
@@ -278,6 +314,9 @@
already_checked_links.append( { 'link': link, 'error':
link_error } )
if link_error != None:
+ if is_link_inside_comment_block:
+ link_error += ' (link commented out)'
+
if not SKIP_TRANSLATION_FILES:
match = re.search( TRANSLATION_REGEXP, filename )
if match and SEPARATE_TRANSLATION_REPORTS:
[Prev in Thread] Current Thread [Next in Thread]
- www/server/source/linc linc.py, Waclaw Jacek <=