[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
www/server/source/linc linc.py
From: Pavel Kharitonov
Subject: www/server/source/linc linc.py
Date: Mon, 03 Dec 2012 10:22:57 +0000
CVSROOT: /web/www
Module name: www
Changes by: Pavel Kharitonov <ineiev> 12/12/03 10:22:57
Modified files:
server/source/linc: linc.py
Log message:
Next revision (RT #786229):
Limit maximum length of lines.
Improve some diagnostic messages.
Add 'education/draft' to excluded directories.
Use 'Lynx/2.8.6rel.5' rather than 'LINC/alpha' as the 'User-Agent'
header.
Report commented out links (including the working ones) in separate
files.
Eliminate the 'SEPARATE_TRANSLATION_REPORTS' variable; translation
reports
are always separated.
Open, append and close the report files on every report rather than keep
them all open at once.
CVSWeb URLs:
http://web.cvs.savannah.gnu.org/viewcvs/www/server/source/linc/linc.py?cvsroot=www&r1=1.3&r2=1.4
Patches:
Index: linc.py
===================================================================
RCS file: /web/www/www/server/source/linc/linc.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -b -r1.3 -r1.4
--- linc.py 2 Apr 2012 00:04:40 -0000 1.3
+++ linc.py 3 Dec 2012 10:22:57 -0000 1.4
@@ -2,6 +2,7 @@
#
# LINC - LINC Is Not Checklink
# Copyright © 2011-2012 Wacław Jacek
+# Copyright © 2012 Free Software Foundation, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -18,28 +19,67 @@
# defines
-BASE_DIRECTORY = '/home/w/wwj/www-repo/'
+BASE_DIRECTORY = '/home/g/gnun/checkouts/www/'
REMOTE_BASE_DIRECTORY = 'http://www.gnu.org/'
-ADDITIONAL_HTTP_HEADERS = 'User-Agent: LINC/alpha\r\nAccept: text/html,
text/plain, audio/mod, image/*, application/msword, application/pdf,
application/postscript, text/sgml, video/mpeg, */*;q=0.01\r\n' # end every
header with "\r\n"
-DELAY_BETWEEN_CHECKS = 1 # In seconds. Set to 0 to disable delay between
checks of different links.
-DELAY_BETWEEN_RETRIES = 10 # In seconds. Used when a link fails before
re-checking it. Set to 0 to disable delays.
+# End every header with "\r\n"
+# We say we are like Lynx because some ignorant sites like Sourceforge don't
+# know what LINC/alpha is and still discriminate on User-Agent with the default
+# behavior being inappropriate for us.
+ADDITIONAL_HTTP_HEADERS = 'User-Agent: Lynx/2.8.6rel.5\r\n\
+Accept: text/html, text/plain, audio/mod, image/*, application/msword, \
+application/pdf, application/postscript, text/sgml, video/mpeg, */*;q=0.01\r\n'
+
+# In seconds. Set negative to disable delay between checks of different links.
+DELAY_BETWEEN_CHECKS = 1
+
+# In seconds. Used when a link fails before re-checking it.
+# Set negative to disable delays.
+DELAY_BETWEEN_RETRIES = 10
+
FORWARDS_TO_FOLLOW = 5 # How many forwards should be followed.
-NUMBER_OF_ATTEMPTS = 3 # Number of times to check a link for error. If an
attempt is successful, the link is no longer checked during that program run.
-REPORT_FILE_NAME = 'reports-temp/broken_links' # Path to the file to which the
errors will be reported.
-SEPARATE_TRANSLATION_REPORTS = True # If you set this to True, reports for
translations will be saved into "REPORT_FILE_NAME.lang" instead of in the main
report file.
-SKIP_TRANSLATION_FILES = False # If you set this to True, files with
translations will be skipped.
-SOCKET_TIMEOUT = 20 # After what time to give up with trying to retrieve a
website.
+
+# Number of times to check a link for error. If an attempt is successful,
+# the link is no longer checked during that program run.
+NUMBER_OF_ATTEMPTS = 3
+
+# Path to the file to which the errors will be reported.
+# Note: this is typically a temporary directory because
+# linc run takes many hours (almost a day), and the old results
+# wouldn't be available during that period if new files went
+# directly to the destination directory.
+REPORT_FILE_PREFIX = 'reports-temp'
+
+# File to which the errors will be reported.
+REPORT_FILE_NAME = 'broken_links'
+
+# File to which commented out links will be reported.
+COMMENTED_FILE_NAME = 'commented_out'
+
+# If you set this to True, files with translations will be skipped.
+SKIP_TRANSLATION_FILES = False
+
+# After what time to give up with trying to retrieve a website.
+SOCKET_TIMEOUT = 20
# regexp-related defines
-EXCLUDED_DIRECTORIES_REGEXP =
'^(japan|wwwes|wwwin|education/fr|press|server/staging|software/[^/]+)$|(^|/)po$'
# Matching directories will not be entered to check their files or
subdirectories.
-EXCLUDED_FILENAMES_REGEXP =
'^server/standards/boilerplate\.html|server/.*whatsnew\.html$'
+# Matching directories will not be entered to check their
+# files or subdirectories.
+EXCLUDED_DIRECTORIES_REGEXP = '^(japan|wwwes|wwwin|education/fr|\
+education/draft|press|server/staging|software/[^/]+)$|(^|/)po$'
+EXCLUDED_FILENAMES_REGEXP = \
+ '^server/standards/boilerplate\.html|server/.*whatsnew\.html$'
+
FILENAMES_TO_CHECK_REGEXP = '\.html$' # Only matching files will be checked.
+
FTP_LINK_REGEXP = 'ftp://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?'
-HTTP_ERROR_HEADER = '^HTTP/1\.1 (?P<http_error_code>403|404) ' # What to treat
as a HTTP error header.
+
+# What to treat as a HTTP error header.
+HTTP_ERROR_HEADER = '^HTTP/1\.1 (?P<http_error_code>403|404) '
HTTP_FORWARD_HEADER = '^HTTP/1\.1 (301 Moved Permanently|302 Found)$'
-HTTP_LINK_REGEXP =
'http://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?(?P<resource>/[^#]*)?'
+HTTP_LINK_REGEXP = \
+ 'http://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?(?P<resource>/[^#]*)?'
HTTP_NEW_LOCATION_HEADER = '^Location: (?P<new_location>.+)$'
LINK_REGEXP = '<a( .+?)? href="(?P<link>[^mailto:].+?)"( .+?)?>'
TRANSLATION_REGEXP = '\.(?P<langcode>[a-z]{2}|[a-z]{2}-[a-z]{2})\.[^.]+$'
@@ -59,7 +99,9 @@
# functions
def format_error( filename, line_number, link, error_message ):
- return str( filename ) + ':' + str( line_number ) + ': ' + str( link
).replace( ' ', '%20' ) + ' ' + str( error_message ) + '\n'
+ return str( filename ) + ':' + str( line_number ) + ': ' \
+ + str( link ).replace( ' ', '%20' ) + ' ' \
+ + str( error_message ) + '\n'
def get_ftp_link_error( link ):
connection_data = re.search( FTP_LINK_REGEXP, link )
@@ -71,7 +113,9 @@
port = 21
socketfd = socket_create()
- if socketfd == None: # if a socket couldn't be created, just
ignore this link this time.
+ # if a socket couldn't be created,
+ # just ignore this link this time.
+ if socketfd == None:
return None
if socket_connect( socketfd, hostname, port ) == False:
@@ -81,12 +125,14 @@
socketfd.close()
return None
-def get_http_link_error( link, forwarded_from = None ): # forwarded_from is
either None or a list
+# forwarded_from is either None or a list
+def get_http_link_error( link, forwarded_from = None ):
if forwarded_from == None:
forwarded_from = []
connection_data = re.search( HTTP_LINK_REGEXP, link )
- if connection_data:
+ if not connection_data:
+ return None
hostname = connection_data.group( 'hostname' )
port = connection_data.group( 'port' )
resource = connection_data.group( 'resource' )
@@ -95,7 +141,9 @@
port = 80
socketfd = socket_create()
- if socketfd == None: # if a socket couldn't be created, just
ignore this link this time.
+ # if a socket couldn't be created,
+ # just ignore this link this time.
+ if socketfd == None:
return None
if socket_connect( socketfd, hostname, port ) == False:
@@ -105,7 +153,8 @@
if resource == None:
resource = '/'
- socketfd.send( 'GET ' + resource + ' HTTP/1.1\r\nHost: ' +
hostname + '\r\n' + ADDITIONAL_HTTP_HEADERS + '\r\n' )
+ socketfd.send( 'GET ' + resource + ' HTTP/1.1\r\nHost: ' \
+ + hostname + '\r\n' + ADDITIONAL_HTTP_HEADERS + '\r\n' )
webpage = socket_read( socketfd )
if webpage == None:
@@ -116,7 +165,8 @@
end_of_headers_pos = webpage.find( '\r\n\r\n' )
if end_of_headers_pos == -1:
- return 'couldn\'t find end of headers (possibly no
content in file)'
+ return 'couldn\'t find end of ' \
+ + 'headers (possibly no content in file)'
header_lines = webpage[ : end_of_headers_pos ]
header_lines = header_lines.split( '\r\n' )
@@ -129,69 +179,76 @@
# look for forwards
match = regexp_search_list( HTTP_FORWARD_HEADER, header_lines )
+ if not match:
+ return None
+ # if we haven't been forwarded too many times yet...
+ if len( forwarded_from ) >= FORWARDS_TO_FOLLOW:
+ # we've been forwarded too many times, sorry.
+ return 'too many forwards (over ' \
+ + str( len( forwarded_from ) ) + ')'
+ match = regexp_search_list(HTTP_NEW_LOCATION_HEADER, header_lines)
if match:
- if len( forwarded_from ) < FORWARDS_TO_FOLLOW: # if we
haven't been forwarded too many times yet...
- match = regexp_search_list(
HTTP_NEW_LOCATION_HEADER, header_lines )
- if match:
- forwarded_from.append( link )
- new_location = match.group(
'new_location' )
+ forwarded_from.append(link)
+ new_location = match.group('new_location')
if new_location in forwarded_from:
return 'forward loop!'
- else:
- return get_http_link_error(
new_location, forwarded_from )
- else: # we've been forwarded too many times, sorry.
- return 'too many forwards (over ' + str( len(
forwarded_from ) ) + ')'
-
+ return get_http_link_error(new_location, forwarded_from)
return None
-def is_match_inside_comment_block( regexp_match ):
+def is_match_inside_comment( regexp_match ):
haystack = regexp_match.string
match_pos = regexp_match.start()
- comment_block_start = haystack.rfind( '<!--', 0, match_pos )
- comment_block_end = haystack.rfind( '-->', 0, match_pos )
+ comment_block_start = haystack.rfind('<!--', 0, match_pos)
+ comment_block_end = haystack.rfind('-->', 0, match_pos)
if comment_block_start > comment_block_end:
return True
return False
-def regexp_search_list( regexp, the_list ):
+def regexp_search_list(regexp, the_list):
for list_element in the_list:
- match = re.search( regexp, list_element )
+ match = re.search(regexp, list_element)
if match:
return match
return None
-def search_directory_for_files( base_directory, directory ):
- for element_name in os.listdir( os.path.join( base_directory, directory
) ):
- relative_path_to_element = os.path.join( directory,
element_name )
- full_path_to_element = os.path.join( base_directory,
relative_path_to_element )
- if os.path.isdir( full_path_to_element ):
- if re.search( EXCLUDED_DIRECTORIES_REGEXP,
relative_path_to_element ):
+def search_directory_for_files(base_directory, directory):
+ for element_name in os.listdir(os.path.join(base_directory, directory)):
+ relative_path_to_element = os.path.join(directory, element_name)
+ full_path_to_element = os.path.join(base_directory, \
+ relative_path_to_element)
+ if os.path.isdir(full_path_to_element):
+ if re.search(EXCLUDED_DIRECTORIES_REGEXP, \
+ relative_path_to_element):
continue
- search_directory_for_files( base_directory,
relative_path_to_element )
+ search_directory_for_files(base_directory, \
+ relative_path_to_element)
else: # it's a file
- if not re.search( FILENAMES_TO_CHECK_REGEXP,
element_name ):
+ if not re.search(FILENAMES_TO_CHECK_REGEXP, \
+ element_name):
continue
- if ( SKIP_TRANSLATION_FILES == True ) and re.search(
TRANSLATION_REGEXP, element_name ):
+ if (SKIP_TRANSLATION_FILES == True) \
+ and re.search(TRANSLATION_REGEXP, element_name):
continue
- if re.search( EXCLUDED_FILENAMES_REGEXP,
relative_path_to_element ):
+ if re.search(EXCLUDED_FILENAMES_REGEXP, \
+ relative_path_to_element):
continue
- files_to_check.append( relative_path_to_element )
+ files_to_check.append(relative_path_to_element)
-def socket_connect( socketfd, hostname, port ):
+def socket_connect(socketfd, hostname, port):
try:
- socketfd.connect( ( hostname, int( port ) ) )
+ socketfd.connect((hostname, int(port)))
except socket.error, message:
return False
return True
def socket_create():
try:
- socketfd = socket.socket( socket.AF_INET, socket.SOCK_STREAM )
- socketfd.settimeout( SOCKET_TIMEOUT )
+ socketfd = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ socketfd.settimeout(SOCKET_TIMEOUT)
except socket.error, message:
return None
return socketfd
@@ -204,25 +261,36 @@
return None
return output
+def clear_file( name ):
+ fd = open( name, 'w' )
+ fd.close()
+
### OK, main program below.
+if REPORT_FILE_PREFIX[-1] != '/':
+ REPORT_FILE_PREFIX += '/'
+
+REPORT_FILE_NAME = REPORT_FILE_PREFIX + REPORT_FILE_NAME
+COMMENTED_FILE_NAME = REPORT_FILE_PREFIX + COMMENTED_FILE_NAME
+
base_directory = BASE_DIRECTORY
remote_base_directory = REMOTE_BASE_DIRECTORY
if remote_base_directory[-1] != '/':
remote_base_directory += '/'
-remote_site_root = remote_base_directory[ : ( remote_base_directory.find( '/',
( remote_base_directory.find( '://' ) + 3 ) ) + 1 ) ]
+pos = remote_base_directory.find( '://' ) + 3
+pos = remote_base_directory.find( '/', pos) + 1
+remote_site_root = remote_base_directory[ : pos ]
# `cd` to this path
if not os.path.isdir( base_directory ):
- print 'Entered base directory isn\'t a directory (or doesn\'t exist at
all).'
+ print 'Base directory', \
+ "`" + base_directory + "'", 'not found.'
sys.exit( 1 )
-# list files
print 'Recursively listing all files in the selected directory...'
-search_directory_for_files( base_directory, '' )
+search_directory_for_files( base_directory, '')
-# find links
links_to_check = []
print 'Listing files done. Looking for links...'
for file_to_check in files_to_check:
@@ -236,33 +304,41 @@
line_number = -1
split_file_contents = file_contents.split( '\n' )
- for checked_line_number, line in enumerate( split_file_contents
):
- checked_line_number += 1 # so that line numbers don't
start from 0
+ for checked_line_number, line \
+ in enumerate( split_file_contents ):
+ checked_line_number += 1 # so that line numbers
+ # don't start from 0
if line.find( link ) != -1:
line_number = checked_line_number
break
- link_container = { 'filename': file_to_check, 'line_number':
line_number, 'link': link, 'is_link_inside_comment_block':
is_match_inside_comment_block( match ) }
+ link_container = { 'filename': file_to_check, \
+ 'line_number': line_number, \
+ 'link': link, \
+ 'is_inside_comment': \
+ is_match_inside_comment( match ) }
links_to_check.append( link_container )
-# check links
print 'Alright, I\'ve got all the links. Let\'s check them now!'
-# open report files for writing
-report_file = open( REPORT_FILE_NAME, 'w' )
+report_file = REPORT_FILE_NAME
+clear_file( report_file )
+commented_file = COMMENTED_FILE_NAME
+clear_file( commented_file )
translation_report_files = {}
number_of_links_to_check = str( len( links_to_check ) )
already_checked_links = []
for i, link_container in enumerate( links_to_check ):
if i % 10 == 0:
- print '\rChecking link ' + str( i + 1 ) + ' of ' +
number_of_links_to_check + '...',
+ print '\rChecking link ' + str( i + 1 ) + ' of ' \
+ + number_of_links_to_check + '...',
sys.stdout.flush()
filename = link_container['filename']
line_number = link_container['line_number']
link = link_container['link']
- is_link_inside_comment_block =
link_container['is_link_inside_comment_block']
+ is_inside_comment = link_container['is_inside_comment']
link_type = None
@@ -280,24 +356,22 @@
link = remote_site_root + link[1:]
else:
link_type = 'http'
-
- last_slash_pos = filename.rfind( '/' )
- subdirectory = ''
- if last_slash_pos != -1:
- subdirectory = filename[ : last_slash_pos ] +
'/'
-
- link = remote_base_directory + subdirectory + link
+ subdir = ''
+ pos = filename.rfind( '/' )
+ if pos != -1:
+ subdir = filename[: pos] + '/'
+ link = remote_base_directory + subdir + link
link_already_checked = False
- link_id_if_already_checked = -1
+ link_id = -1
for i, already_checked_link in enumerate( already_checked_links ):
if link == already_checked_link['link']:
link_already_checked = True
- link_id_if_already_checked = i
+ link_id = i
break
if link_already_checked:
- link_error =
already_checked_links[link_id_if_already_checked]['error']
+ link_error = already_checked_links[link_id]['error']
else:
link_error = None
for i in range( NUMBER_OF_ATTEMPTS ):
@@ -310,35 +384,44 @@
if link_error == None:
break
else:
- break # ignore the link, since its protocol is
unsupported
+ break # ignore the link,
+ # since its protocol is unsupported
- if DELAY_BETWEEN_RETRIES != 0:
+ if DELAY_BETWEEN_RETRIES > 0:
time.sleep( DELAY_BETWEEN_RETRIES )
- already_checked_links.append( { 'link': link, 'error':
link_error } )
+ already_checked_links.append( { 'link': link, \
+ 'error': link_error } )
+
+ # Report working links inside comments so that webmasters
+ # could uncomment them.
+ if link_error == None and is_inside_comment:
+ link_error = 'no error detected'
if link_error != None:
- if is_link_inside_comment_block:
+ if is_inside_comment:
link_error += ' (link commented out)'
+ file_to_write = commented_file
+ postfix = '/c'
+ else:
+ file_to_write = report_file
+ postfix = '/0'
- if not SKIP_TRANSLATION_FILES:
match = re.search( TRANSLATION_REGEXP, filename )
- if match and SEPARATE_TRANSLATION_REPORTS:
+ if match:
langcode = match.group( 'langcode' )
- if langcode not in translation_report_files:
- translation_report_files[langcode] =
open( REPORT_FILE_NAME + '-' + langcode, 'w' )
- translation_report_files[langcode].write(
format_error( filename, line_number, link, link_error ) )
- else:
- report_file.write( format_error( filename,
line_number, link, link_error ) )
- else:
- report_file.write( format_error( filename, line_number,
link, link_error ) )
+ file_idx = langcode + postfix
+ if file_idx not in translation_report_files:
+ translation_report_files[file_idx] = \
+ file_to_write + '-' + langcode
+ clear_file(translation_report_files[file_idx])
+ file_to_write = translation_report_files[file_idx]
+ fd = open(file_to_write, 'a')
+ fd.write(format_error(filename, line_number, \
+ link, link_error))
+ fd.close()
- if DELAY_BETWEEN_CHECKS != 0:
+ if DELAY_BETWEEN_CHECKS > 0:
time.sleep( DELAY_BETWEEN_CHECKS )
-for langcode in translation_report_files:
- translation_report_files[langcode].close()
-report_file.close()
-
print '\nDone! :-)'
-
- www/server/source/linc linc.py,
Pavel Kharitonov <=