www/server/source/linc linc.py


From: Pavel Kharitonov
Subject: www/server/source/linc linc.py
Date: Tue, 12 Mar 2013 05:53:00 +0000

CVSROOT:        /web/www
Module name:    www
Changes by:     Pavel Kharitonov <ineiev>       13/03/12 05:53:00

Modified files:
        server/source/linc: linc.py 

Log message:
        New options to control the program's reaction to irregularities:
        `-g', `-w'.
        Use a new function, `report', for diagnostics.
        Add initial support for symlinks.
        Rewrite file parsing, fix line number calculation.
        When checking, iterate over links by unique URL rather than by
        their occurrences in the files.
        Save the link type in a container rather than recomputing it in
        every cycle.
        Clear reports for English files on the first write, consistent with
        the behavior for reports about translations.
        Make `report_files' a list rather than a dictionary.
        (get_http_link_error): Don't split the header into lines; search its
        whole text for the messages.
        (regexp_search_list): Remove the function.
        (load_cache): Return a dictionary instead of a list.
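
For illustration (not part of the patch): the new `report' helper prints a
message only when VERBOSE exceeds the message's level, so errors carry
negative levels and debug detail positive ones, while the new `-w'/`-g'
options move a separate "wickedness" counter that decides whether
irregularities (blind symlinks, unchecked links and the like) abort the
run.  A minimal sketch of that convention, simplified from the patch below:

    # Sketch of the VERBOSE/WICKED convention; simplified, not the
    # actual linc.py option handling.
    VERBOSE = 0   # each -v adds 1, each -q subtracts 1
    WICKED = 0    # each -w adds 1 (stricter), each -g subtracts 1

    def report(level, msg):
        if VERBOSE > level:
            print(msg)

    report(-3, 'an error: shown unless run with -q -q -q or quieter')
    report(2, 'debug detail: shown only with -v -v -v or more')

    if WICKED > 0:   # with -w, treat irregularities as fatal
        raise SystemExit('Aborting due to an irregularity.')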

CVSWeb URLs:
http://web.cvs.savannah.gnu.org/viewcvs/www/server/source/linc/linc.py?cvsroot=www&r1=1.12&r2=1.13
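
Also for illustration, with hypothetical names: iterating by unique URLs
means every occurrence of a link (file and line) stays in a list, while the
network checks and the cache are dictionaries keyed by URL, so each unique
URL is fetched at most once per pass and report writing does constant-time
lookups.  A simplified sketch of that bookkeeping:

    # Simplified sketch of the unique-URL bookkeeping; record_link() and
    # check_url() are made-up stand-ins, not functions from linc.py.
    links_to_check = []   # one entry per occurrence of a link in a file
    urls_to_check = {}    # one entry per unique URL

    def record_link(filename, line_number, url):
        links_to_check.append({'filename': filename,
                               'line_number': line_number,
                               'URL': url})
        if url not in urls_to_check:
            urls_to_check[url] = None

    def check_url(url):   # stand-in for the real FTP/HTTP checks
        return None       # None means "no error detected"

    record_link('gnu/gnu.html', 12, 'http://www.gnu.org/philosophy/')
    record_link('home.html', 34, 'http://www.gnu.org/philosophy/')

    checked_urls = {}     # like the new load_cache(): dictionary keyed by URL
    for url in urls_to_check:
        if url not in checked_urls:   # URLs already in the cache are skipped
            checked_urls[url] = check_url(url)

    for occurrence in links_to_check:   # one O(1) lookup per occurrence
        error = checked_urls[occurrence['URL']]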

Patches:
Index: linc.py
===================================================================
RCS file: /web/www/www/server/source/linc/linc.py,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -b -r1.12 -r1.13
--- linc.py     18 Feb 2013 17:58:28 -0000      1.12
+++ linc.py     12 Mar 2013 05:52:59 -0000      1.13
@@ -18,17 +18,18 @@
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-# defines
-
-LINC_VERSION = 'LINC 0.8'
+LINC_VERSION = 'LINC 0.9'
+USAGE = \
+'''Usage: %prog [options] [BASE_DIRECTORY]
+Check links in HTML files from BASE_DIRECTORY.'''
 COPYRIGHT= \
-'Copyright (C) 2011-2012 Waclaw Jacek\n\
-Copyright (C) 2013 Free Software Foundation, Inc.\n\
-License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>\n\
-This is free software: you are free to change and redistribute it.\n\
-There is NO WARRANTY, to the extent permitted by law.\n\
-\n\
-Written by Waclaw Jacek.'
+'''Copyright (C) 2011-2012 Waclaw Jacek
+Copyright (C) 2013 Free Software Foundation, Inc.
+License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
+This is free software: you are free to change and redistribute it.
+There is NO WARRANTY, to the extent permitted by law.
+
+Written by Waclaw Jacek.'''
 
 BASE_DIRECTORY = ''
 REMOTE_BASE_DIRECTORY = 'http://www.gnu.org/'
@@ -78,8 +79,6 @@
 
 CACHE = None
 
-# regexp-related defines
-
 # Matching directories will not be entered to check their
 # files or subdirectories.
 EXCLUDED_DIRECTORIES_REGEXP = '^(japan|wwwin|education/fr|\
@@ -88,23 +87,30 @@
   '^server/standards/boilerplate\.html|server/.*whatsnew\.html$'
 
 FILENAMES_TO_CHECK_REGEXP = '\.html$' # Only matching files will be checked.
+SYMLINKS_FILENAME = '.symlinks'
 
 FTP_LINK_REGEXP = 'ftp://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?'
 
 # What to treat as a HTTP error header.
-HTTP_ERROR_HEADER = '^HTTP/1\.1 (?P<http_error_code>403|404) '
-HTTP_FORWARD_HEADER = '^HTTP/1\.1 (301 Moved Permanently|302 Found)$'
+HTTP_ERROR_HEADER = '(^|\r\n)HTTP/1\.1 (?P<http_error_code>403|404) '
+HTTP_FORWARD_HEADER = \
+  '(^|\r\n)HTTP/1\.1 (301 Moved Permanently|302 Found)(\r\n|$)'
 HTTP_LINK_REGEXP = \
   'http://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?(?P<resource>/[^#]*)?'
-HTTP_NEW_LOCATION_HEADER = '^Location: (?P<new_location>.+)$'
-# Links like href="mailto:..."; and href="irc:..." are excluded.
-LINK_REGEXP = '<a(\s.+?)?\shref="(?P<link>[^"]+)"(\s.+?)?>'
+HTTP_NEW_LOCATION_HEADER = '(^|\r\n)Location: (?P<new_location>.+)(\r\n|$)'
+LINK_BEGIN = '(?i)(<a\s[^<]*)'
+# We want to parse links like href="URL" as well as href='URL';
+# I failed to compose a single regexp for that -- ineiev.
+LINK_REGEXP = \
+[ '(?is)^<a(\s.+?)?\shref="(?P<link>[^"]+)"(\s.+?)?>',
+  "(?is)^<a(\s.+?)?\shref='(?P<link>[^']+)'(\s.+?)?>"]
 TRANSLATION_REGEXP = '\.(?P<langcode>[a-z]{2}|[a-z]{2}-[a-z]{2})\.[^.]+$'
+SYMLINK_REGEXP='^\s*(?P<to>[^\s]*)\s+(?P<from>[^\s]*).*$'
 # Don't report against commented out link to README.translations.html
 LINK_TO_SKIP = '/server/standards/README.translations.html'
 
 VERBOSE = 0
-# libraries
+WICKED = 0
 
 import os
 import re
@@ -113,21 +119,22 @@
 import time
 from optparse import OptionParser
 
-# global variables
-
-files_to_check = []
+symlinks = {}
 remote_site_root = None
 remote_base_directory = None
 
-# functions
+def report(level, msg):
+       if VERBOSE > level:
+               print msg
+
+def format_error(symlink, filename, line_number, link, error_message):
+       if symlink != None:
+               filename += ' <- ' + symlink
+       return filename + ':' + line_number + ': ' \
+              + link.replace(' ', '%20') + ' ' + error_message + '\n'
        
-def format_error( filename, line_number, link, error_message ):
-       return str( filename ) + ':' + str( line_number ) + ': ' \
-              + str( link ).replace( ' ', '%20' ) + ' ' \
-              + str( error_message ) + '\n'
-       
-def get_ftp_link_error( link ):
-       connection_data = re.search( FTP_LINK_REGEXP, link )
+def get_ftp_link_error(link):
+       connection_data = re.search(FTP_LINK_REGEXP, link)
        if not connection_data:
                return None
        hostname = connection_data.group('hostname')
@@ -150,7 +157,7 @@
        return None
 
 # forwarded_from is either None or a list
-def get_http_link_error( link, forwarded_from = None ):
+def get_http_link_error(link, forwarded_from = None):
        if forwarded_from == None:
                forwarded_from = []
 
@@ -187,89 +194,226 @@
        
        socketfd.close()
        
-       end_of_headers_pos = webpage.find( '\r\n\r\n' )
-       if end_of_headers_pos == -1:
-               if VERBOSE > 1:
-                       print 'No end of headers found on webpage (link ' \
-                        + link + ')'
-                       print '- - - - -'
-                       print webpage
-                       print '- - - - -'
+       end_of_headers = webpage.find('\r\n\r\n')
+       if end_of_headers == -1:
+               report(1, 'No end of headers found on webpage (link ' \
+                        + link + ')')
+               report(1, '- - - - -')
+               report(1, webpage)
+               report(1, '- - - - -')
                return 'couldn\'t find end of ' \
                        + 'headers (possibly no content in file)'
 
-       header_lines = webpage[ : end_of_headers_pos ]
-       header_lines = header_lines.split( '\r\n' )
-       
+       header = webpage[:end_of_headers]
        # search for errors
-       match = regexp_search_list( HTTP_ERROR_HEADER, header_lines )
+       match = re.search(HTTP_ERROR_HEADER, header)
        if match:
-               http_error_code = match.group( 'http_error_code' )
+               http_error_code = match.group('http_error_code')
                return 'http error ' + http_error_code + ' returned by server'
                
        # look for forwards
-       match = regexp_search_list( HTTP_FORWARD_HEADER, header_lines )
+       match = re.search(HTTP_FORWARD_HEADER, header)
        if not match:
                return None
        # if we haven't been forwarded too many times yet...
-       if len( forwarded_from ) >= FORWARDS_TO_FOLLOW:
+       if len(forwarded_from) >= FORWARDS_TO_FOLLOW:
                # we've been forwarded too many times, sorry.
                return 'too many forwards (over ' \
-                       + str( len( forwarded_from ) ) + ')'
-       match = regexp_search_list(HTTP_NEW_LOCATION_HEADER, header_lines)
-       if match:
+                       + str(FORWARDS_TO_FOLLOW) + ')'
+       match = re.search(HTTP_NEW_LOCATION_HEADER, header)
+       if not match:
+               return None
                forwarded_from.append(link)
                new_location = match.group('new_location')
                if new_location in forwarded_from:
                        return 'forward loop!'
                return get_http_link_error(new_location, forwarded_from)
-       return None
        
-def is_match_inside_comment( regexp_match ):
-       haystack = regexp_match.string
-       match_pos = regexp_match.start()
-       comment_block_start = haystack.rfind('<!--', 0, match_pos)
-       comment_block_end = haystack.rfind('-->', 0, match_pos)
-       if comment_block_start > comment_block_end:
-               if haystack[comment_block_start + len('<!--')] == '#':
+def is_inside_comment(head):
+       start = head.rfind('<!--')
+       if start <= head.rfind('-->'):
+               return 'no'
+       n = len('<!--') + start
+       if len(head) <= n:
+               return 'yes'
+       if head[n] != '#':
+               return 'yes'
                        return 'ssi'
+
+# Current symlinks processing doesn't take into account parallel
+# links to the translations; instead, the translators should maintain
+# URLs in sync with the originals.
+def load_symlinks(root, directory, path):
+       report(1, 'Found symlinks file `' + path + "'.")
+       try:
+               f = open(os.path.join(root, path), 'r')
+       except IOError:
+               report(-3, "Failed to read symlinks file `" + path + "'.")
+               return
+       lines = f.read().splitlines()
+       f.close()
+       if not directory in symlinks:
+               symlinks[directory] = {}
+       for i, l in enumerate(lines):
+               # Skip empty lines and comments.
+               if re.search('^\s*(#|$)', l):
+                       continue
+               match = re.search(SYMLINK_REGEXP, l)
+               if not match:
+                       report(-3, path + ":" + str(i + 1) \
+                               + ": Couldn't parse symlink `" + l + "'.")
+                       continue
+               source = match.group('from')
+               dest = match.group('to')
+               report(2, path + ":" + str(i + 1) + ": Symlink `" \
+                       + dest + "' <- `" + source + "' found.")
+               symlinks[directory][source] = dest
+
+def classify_link(filename, link, symlink = None):
+       link_type = 'http'
+       # When we process a symlinked file, we use the directory
+       # from which it is linked rather than the actual location
+       # of the file.
+       dir_name = symlink if (symlink != None) else filename
+       if re.search('^(mailto:|irc://|https://)', link):
+               link_type = 'unsupported'
+       elif link.find('http://') == 0:
+               link_type = 'http'
+       elif link.find('ftp://') == 0:
+               link_type = 'ftp'
+       elif link[0] == '/':
+               link = remote_site_root + link[1:]
                else:
-                       return 'yes'
-       return 'no'
+               subdir = ''
+               pos = dir_name.rfind('/')
+               if pos != -1:
+                       subdir = dir_name[: pos] + '/'
+               link = remote_base_directory + subdir + link
+       return [link_type, link]
 
-def regexp_search_list(regexp, the_list):
-       for list_element in the_list:
-               match = re.search(regexp, list_element)
-               if match:
-                       return match
-       return None
+def scan_file(root, file_to_check, symlink = None):
+       path = os.path.join(root, file_to_check)
+       fd = open(path, 'r')
+       text = fd.read()
+       fd.close()
 
-def search_directory_for_files(base_directory, directory):
-       for element_name in os.listdir(os.path.join(base_directory, directory)):
-               relative_path_to_element = os.path.join(directory, element_name)
-               full_path_to_element = os.path.join(base_directory, \
-                                                   relative_path_to_element)
-               if os.path.isdir(full_path_to_element):
-                       if re.search(EXCLUDED_DIRECTORIES_REGEXP, \
-                                    relative_path_to_element):
+       lines = 1
+       head = ''
+       for chunk in re.split(LINK_BEGIN, text):
+               line_no = lines 
+               if chunk == None:
+                       continue
+               commented = is_inside_comment(head)
+               lines += chunk.count('\n')
+               head += chunk
+               match = None
+               for regexp in LINK_REGEXP:
+                       match = re.search(regexp, chunk)
+                       if match != None:
+                               break
+               if not match:
                                continue
-                               
-                       search_directory_for_files(base_directory, \
-                                                  relative_path_to_element)
-               else: # it's a file
-                       if not re.search(FILENAMES_TO_CHECK_REGEXP, \
-                                        element_name):
+               [link_type, url] = classify_link(file_to_check, \
+                                                 match.group('link'), symlink)
+               if link_type == 'unsupported':
                                continue
+               links_to_check.append({'symlink': symlink, 'URL': url,
+                                      'filename': file_to_check,
+                                      'line_number': str(line_no),
+                                      'link': match.group('link'),
+                                      'is_inside_comment': commented})
+               if not url in urls_to_check:
+                       urls_to_check[url] = \
+                                        {'link': match.group('link'),
+                                         'is_inside_comment': commented,
+                                         'type': link_type}
+
+def scan_directory(root, directory):
+       for element_name in os.listdir(os.path.join(root, directory)):
+               relative_path = os.path.join(directory, element_name)
+               full_path = os.path.join(root, relative_path)
+               if os.path.isdir(full_path):
+                       if not re.search(EXCLUDED_DIRECTORIES_REGEXP, \
+                                    relative_path):
+                               scan_directory(root, relative_path)
+                       continue
+               if SYMLINKS_FILENAME == element_name:
+                       load_symlinks(root, directory, relative_path)
+                       continue
+               if not re.search(FILENAMES_TO_CHECK_REGEXP, element_name):
+                       continue
+               if re.search(EXCLUDED_FILENAMES_REGEXP, relative_path):
+                       continue
+               scan_file(root, relative_path)
+
+def get_symlink_target(root, directory, destination, depth = 0):
+       if depth > 17:
+               report(-2, 'Too deep symlink.')
+               if WICKED > 1:
+                       print 'Aborting due to a deep symlink.'
+                       exit(1)
+               return None
+       # Skip external links.
+       # TODO: check it as a link.
+       if re.search('^(ftp|http(s?))://', destination):
+               report(2, 'External symlink found.')
+               return None
                
-                       if (SKIP_TRANSLATION_FILES == True) \
-                            and re.search(TRANSLATION_REGEXP, element_name):
-                               continue
+       report(2, 'Getting target of `' + directory + "'/`" \
+                       + destination + "', depth=" + str(depth))
+       path = os.path.join(directory, destination)
+       if path[0] == '/':
+               path = path[1:]
+       pos = path.rfind('/')
+       if pos == -1:
+               dest = path
+               new_dir = ""
+       else:
+               dest = path[pos + 1:]
+               new_dir = path[:pos]
+       while re.search('[^/]*/../', new_dir):
+               new_dir = re.sub('[^/]*/../', '', new_dir)
+       if os.path.exists(os.path.join(root, path)):
+               if os.path.isdir(os.path.join(root, path)):
+                       report(2, 'Symlinked directory `' + path + "' found.")
+                       if re.search(EXCLUDED_DIRECTORIES_REGEXP, new_dir):
+                               report(2, 'Symlinked directory excluded.')
+                               return None
+                       return get_symlink_target(root, new_dir,
+                                'index.html', depth + 1)
+               report(2, 'Symlinked file found.')
+               if re.search(EXCLUDED_FILENAMES_REGEXP, path):
+                       report(2, 'Symlinked file excluded.')
+                       return None
+               return path
+       report(2, 'No `' + path + "' file exists.")
+       report(2, 'Searching for symlinks in `' + path + "'")
+       if new_dir in symlinks:
+               report(2, 'Searching for symlink `' + dest + "'")
+               if dest in symlinks[new_dir]:
+                       report(2, 'Symlink found: `' \
+                               + symlinks[new_dir][dest] + "'")
+                       return get_symlink_target(root, new_dir,
+                                                 symlinks[new_dir][dest],
+                                                 depth + 1)
+       report(-2, 'Blind symlink found.')
+       if WICKED > 1:
+               print 'Aborting due to a blind symlink.'
+               exit(1)
+       return None
 
-                       if re.search(EXCLUDED_FILENAMES_REGEXP, \
-                                     relative_path_to_element):
+def process_symlinks(root):
+       for directory in symlinks:
+               for source in symlinks[directory]:
+                       if not re.search(FILENAMES_TO_CHECK_REGEXP, source):
+                               continue
+                       dest = symlinks[directory][source]
+                       report(2, 'Trying `' + directory + "'/`" \
+                               + source + "' -> `" + dest + "'..")
+                       destination = get_symlink_target(root, directory, dest)
+                       if destination == None:
                                continue
-                       
-                       files_to_check.append(relative_path_to_element)
+                       scan_file(root, destination, os.path.join(directory, source))
                                
 def socket_connect(socketfd, hostname, port):
        try:
@@ -294,8 +438,8 @@
                return None
        return output
 
-def clear_file( name ):
-       fd = open( name, 'w' )
+def clear_file(name):
+       fd = open(name, 'w')
        fd.close()
 
 def show_usage(option, opt, value, parser):
@@ -307,67 +451,42 @@
        print COPYRIGHT
        exit(0)
 
-def classify_link(filename, link):
-       link_type = 'http'
-       if re.search('^(mailto:|irc://|https://)', link):
-               link_type = 'unsupported'
-       elif link.find('http://') == 0:
-               link_type = 'http'
-       elif link.find('ftp://') == 0:
-               link_type = 'ftp'
-       elif link[0] == '/':
-               link = remote_site_root + link[1:]
-       else:
-               subdir = ''
-               pos = filename.rfind('/')
-               if pos != -1:
-                       subdir = filename[: pos] + '/'
-               link = remote_base_directory + subdir + link
-       return [link_type, link]
-
 def load_cache(cache):
        if cache == None:
-               if VERBOSE > 2:
-                       print "No cache file is loaded."
-               return []
+               report(2, "No cache file is loaded.")
+               return {}
        try:
                f = open(cache, 'r')
        except IOError:
-               if VERBOSE > -3:
-                       print "Failed to read cache file `" + cache + "'."
-               return []
-       if VERBOSE > 2:
-               print "Loading cache file `" + cache +"'."
-       links = f.read().splitlines()
+               report(-3, "Failed to read cache file `" + cache + "'.")
+               return {}
+       report(2, "Loading cache file `" + cache +"'.")
+       text = f.read()
        f.close();
-       if VERBOSE > 2:
-               print "Loaded links: " + str(len(links))
-       return links
+       retval = {}
+       for link in text.splitlines():
+               retval[link] = None
+       report(2, "Loaded links: " + str(len(retval)))
+       return retval
 
 def save_cache(cache, checked_links):
        if cache == None:
-               if VERBOSE > 2:
-                       print "No cache file is saved."
+               report(2, "No cache file is saved.")
                return
        try:
                f = open(cache, 'w')
        except IOError:
-               if VERBOSE > -3:
-                       print "Failed to write cache file `" + cache + "'."
+               report(-3, "Failed to write cache file `" + cache + "'.")
                return
-       if VERBOSE > 2:
-               print "\nSaving cache file `" + cache +"'."
+       report(2, "Saving cache file `" + cache +"'.")
        for link in checked_links:
                # Links containing a newline are not cached
                # because newline is used in cache as the separator.
-               if link['error'] == None and link['link'].find('\n') == -1:
-                       f.write(link['link'] + '\n')
+               if checked_links[link] == None and link.find('\n') == -1:
+                       f.write(link + '\n')
        f.close()
 
-usage = \
-'Usage: %prog [options] [BASE_DIRECTORY]\n\
-Check links in HTML files from BASE_DIRECTORY.'
-parser = OptionParser(usage = usage, add_help_option = False)
+parser = OptionParser(usage = USAGE, add_help_option = False)
 
 parser.add_option('-a', '--attempts', dest = 'attempts', type = 'int',
                  metavar = 'N',
@@ -383,6 +502,8 @@
                  metavar = 'N',
                   help = 'maximum number of forwards to follow [' \
                         + str(FORWARDS_TO_FOLLOW) + ']')
+parser.add_option('-g', '--good', dest = 'good', action = 'count',
+                 help = "be more good")
 parser.add_option('-l', '--local', dest = 'local', action = 'store_true',
                  default = False,
                  help = "don't download files, assume no error")
@@ -412,6 +533,8 @@
                         + EXCLUDED_DIRECTORIES_REGEXP + ']')
 parser.add_option('-v', '--verbose', dest = 'verbose', action = 'count',
                  help = "be more verbose")
+parser.add_option('-w', '--wicked', dest = 'wicked', action = 'count',
+                 help = "be more wicked")
 parser.add_option('-h', '-?', '--help', action = 'callback',
                  callback = show_usage, help = 'display this help and exit')
 parser.add_option('-V', '--version', action = 'callback',
@@ -438,7 +561,10 @@
        VERBOSE -= options.quiet
 if options.verbose != None:
        VERBOSE += options.verbose
-
+if options.good != None:
+       WICKED -= options.good
+if options.wicked != None:
+       WICKED += options.wicked
 if options.attempts != None:
        NUMBER_OF_ATTEMPTS = options.attempts
 if options.check_delay != None:
@@ -478,204 +604,154 @@
 REPORT_FILE_NAME = REPORT_FILE_PREFIX + REPORT_FILE_NAME
 COMMENTED_FILE_NAME = REPORT_FILE_PREFIX + COMMENTED_FILE_NAME
 
-if VERBOSE > 0:
-       print "Base directory:       `" + BASE_DIRECTORY + "'"
-       print "Cache file:           " + \
-               ("`" + CACHE + "'" if CACHE else "(None)")
-       print "Number of attempts:    " + str(NUMBER_OF_ATTEMPTS)
-       print "Delay between checks:  " + str(DELAY_BETWEEN_CHECKS)
-       print "Delay between retries: " + str(DELAY_BETWEEN_RETRIES)
-       print "Socket timeout:        " + str(SOCKET_TIMEOUT)
-       print "Forwards to follow:    " + str(FORWARDS_TO_FOLLOW)
-       print "Skip translations:     " + str(SKIP_TRANSLATION_FILES)
-       print "Report to directory:  `" + REPORT_FILE_PREFIX + "'"
-       print "Base URL:             `" + REMOTE_BASE_DIRECTORY + "'"
-       print "Excluded files:       `" + EXCLUDED_FILENAMES_REGEXP + "'"
-       print "Excluded directories: `" + EXCLUDED_DIRECTORIES_REGEXP + "'"
-       print "Run locally:           " + str(LOCAL)
-       print "Verbosity:             " + str(VERBOSE)
-
-# `cd` to this path
-if not os.path.isdir( base_directory ):
-       if VERBOSE > -3:
-               print 'Base directory', \
-                       "`" + base_directory + "'", 'not found.' >> stderr
-       exit(1)
+report(0, "Base directory:       `" + BASE_DIRECTORY + "'")
+report(0, "Cache file:           " + \
+       ("`" + CACHE + "'" if CACHE else "(None)"))
+report(0, "Number of attempts:    " + str(NUMBER_OF_ATTEMPTS))
+report(0, "Delay between checks:  " + str(DELAY_BETWEEN_CHECKS))
+report(0, "Delay between retries: " + str(DELAY_BETWEEN_RETRIES))
+report(0, "Socket timeout:        " + str(SOCKET_TIMEOUT))
+report(0, "Forwards to follow:    " + str(FORWARDS_TO_FOLLOW))
+report(0, "Skip translations:     " + str(SKIP_TRANSLATION_FILES))
+report(0, "Report to directory:  `" + REPORT_FILE_PREFIX + "'")
+report(0, "Base URL:             `" + REMOTE_BASE_DIRECTORY + "'")
+report(0, "Excluded files:       `" + EXCLUDED_FILENAMES_REGEXP + "'")
+report(0, "Excluded directories: `" + EXCLUDED_DIRECTORIES_REGEXP + "'")
+report(0, "Run locally:           " + str(LOCAL))
+report(0, "Verbosity:             " + str(VERBOSE))
+report(0, "Wickedness:            " + str(WICKED))
 
-if VERBOSE >= 0:
-       print 'Recursively listing all files in the selected directory...'
-search_directory_for_files( base_directory, '')
+if not os.path.isdir(base_directory):
+       report(-3, "Base directory `" + base_directory + "' not found.")
+       exit(1)
 
+report(-1, 'Recursively listing all files in the selected directory...')
 links_to_check = []
-if VERBOSE >= 0:
-       print 'Looking for links...'
-for file_to_check in files_to_check:
-       path_to_file = os.path.join( base_directory, file_to_check )
-       fd = open( path_to_file, 'r' )
-       file_contents = fd.read()
-       fd.close()
-
-       for match in re.finditer(LINK_REGEXP, file_contents, re.IGNORECASE):
-               link = match.group('link')
-               line_number = -1
-               split_file_contents = file_contents.split( '\n' )
-               for checked_line_number, line \
-                   in enumerate( split_file_contents ):
-                       checked_line_number += 1 # so that line numbers
-                                                # don't start from 0
-                       if line.find( link ) != -1:
-                               line_number = checked_line_number
-                               break
-               
-               link_container = { 'filename': file_to_check, \
-                                  'line_number': line_number, \
-                                  'link': link, \
-                                  'is_inside_comment': \
-                                      is_match_inside_comment( match ) }
-               links_to_check.append( link_container )
-
-number_of_links_to_check = str( len( links_to_check ) )
-already_checked_links = []
-cached_links = load_cache(CACHE)
-for link in cached_links:
-       already_checked_links.append({'link': link, 'error': None})
+urls_to_check = {}
+scan_directory(base_directory, '')
+report(-1, 'Processing symlinks...')
+process_symlinks(base_directory)
+checked_urls = load_cache(CACHE)
+unique_links = str(len(urls_to_check))
+cached_links = 0
+for link in checked_urls:
+       if link in urls_to_check:
+               del urls_to_check[link]
+               cached_links += 1
+report(-1, 'Checking links...')
 for j in range(NUMBER_OF_ATTEMPTS):
-    if VERBOSE > -2:
-       print 'Pass ' + str(j + 1) + ' of ' + str(NUMBER_OF_ATTEMPTS) + ':'
-    for i, link_container in enumerate( links_to_check ):
+    report(-2, 'Pass ' + str(j + 1) + ' of ' + str(NUMBER_OF_ATTEMPTS) + ':')
+    next_urls_to_check = urls_to_check.copy()
+    number_of_links_to_check = str(len(urls_to_check))
+    for i, url in enumerate(urls_to_check):
+       link_container = urls_to_check[url]
        if (i % 10 == 0 and VERBOSE > -2):
                print '\rChecking link ' + str(i + 1) + ' of ' \
                      + number_of_links_to_check + '...',
                sys.stdout.flush()
 
-       filename = link_container['filename']
        link = link_container['link']
+       # BTW, shouldn't we check whether the named fragment
+       # is present on the page?
        if link_container['is_inside_comment'] == 'ssi' or link[0] == '#':
                continue
-
-       if VERBOSE > 2 and link[0] != '/' and not re.search('^[^/]*:', link):
-               print '\n' + filename + ':' \
-                     + str(link_container['line_number']) + ': link ' \
-                     + str(i) + ' `' + link + "' is relative"
-
-       [link_type, link] = classify_link(filename, link)
-       if link_type == 'unsupported':
-               continue
-
-       link_id = -1
        link_error = None
-       for i, checked_link in enumerate(already_checked_links):
-               if link == checked_link['link']:
-                       link_id = i
-                       link_error = already_checked_links[link_id]['error']
-                       break
-       checked_link = None
-       if link_id > -1:
-               if already_checked_links[link_id]['error'] == None:
+       if url in checked_urls:
+               link_error = checked_urls[url]
+               if link_error == None:
+                       del next_urls_to_check[url]
                        continue
-               checked_link = already_checked_links[link_id]
+       link_type = link_container['type']
        if LOCAL:
                link_error = None
        elif link_type == 'ftp':
-               link_error = get_ftp_link_error( link )
+               link_error = get_ftp_link_error(url)
        elif link_type == 'http':
-               link_error = get_http_link_error( link )
+               link_error = get_http_link_error(url)
        else:
+               report(1, 'Unexpected link type `' + link_type + "' found.")
+               if WICKED > 0:
+                       print 'Aborting due to an unexpected link type.'
+                       exit(1)
                continue
-       if checked_link != None:
+       checked_urls[url] = link_error
                if link_error == None:
-                       already_checked_links[link_id]['error'] = None
-       else:
-               already_checked_links.append( { 'link': link, \
-                                       'error': link_error } )
+               del next_urls_to_check[url]
        if DELAY_BETWEEN_CHECKS > 0:
                time.sleep(DELAY_BETWEEN_CHECKS)
-    save_cache(CACHE, already_checked_links)
-    if VERBOSE > -2:
+    urls_to_check = next_urls_to_check
+
+    save_cache(CACHE, checked_urls)
        broken_so_far = 0
-       for checked_link in already_checked_links:
-               if checked_link['error'] != None:
+    for i, link in enumerate(checked_urls):
+       err = checked_urls[link]
+       if err != None:
                        broken_so_far = broken_so_far + 1
-       print '\n' + str(len(already_checked_links)) + ' unique links, ' \
-             + str(broken_so_far) + ' seem broken'
-       if VERBOSE > 1:
-               for i, checked_link in enumerate(already_checked_links):
-                       if checked_link['error'] != None or VERBOSE > 2:
-                               print 'link ' + str(i) + ': ' \
-                               + checked_link['link'] + ': ' \
-                               + (checked_link['error'] \
-                                    if checked_link['error'] else '(no error)')
+       if (VERBOSE > 1 and err != None) or VERBOSE > 2:
+               print 'link ' + str(i) + ': ' + link \
+                       + ': ' + (err if err else '(no error)')
+    report(-2, '\n' + str(len(links_to_check)) + ' links, ' \
+             + unique_links + ' unique, ' \
+             + str(cached_links) + ' cached, ' \
+             + str(broken_so_far) + ' seem broken')
        if broken_so_far == 0:
-               print 'No more broken links; skipping the rest passes (if any)'
+       report(-2, 'No more broken links; skipping the rest passes (if any)')
                break
     if j < NUMBER_OF_ATTEMPTS - 1 and DELAY_BETWEEN_RETRIES > 0:
        time.sleep(DELAY_BETWEEN_RETRIES)
 
-if VERBOSE >= 0:
-       print 'Writing reports...'
+report(-1, 'Writing reports...')
 
 report_file = REPORT_FILE_NAME
-clear_file( report_file )
 commented_file = COMMENTED_FILE_NAME
-clear_file( commented_file )
-report_files = {}
+report_files = []
 
 for i, link_container in enumerate(links_to_check):
        filename = link_container['filename']
        line_number = link_container['line_number']
+       url = link_container['URL']
        link = link_container['link']
-       is_inside_comment = link_container['is_inside_comment']
-       if is_inside_comment == 'ssi' or link[0] == '#':
+       commented = link_container['is_inside_comment']
+       if commented == 'ssi' or link[0] == '#':
                continue
-       if link_container['is_inside_comment'] == 'yes' \
-               and link == LINK_TO_SKIP:
-               if VERBOSE > 2 :
-                       print 'Skipping link `' + LINK_TO_SKIP + "'"
+       if commented == 'yes' and link == LINK_TO_SKIP:
+               report(2, 'Skipping link `' + LINK_TO_SKIP + "'")
                continue
 
-       [link_type, link] = classify_link(filename, link)
+       if link[0] != '/' and not re.search('^[^/]*:', link):
+               report(2, filename + ':' + line_number \
+                     + ': link ' + str(i) + ' `' + link + "' is relative")
                        
-       if link_type == 'unsupported':
-               if VERBOSE > 2:
-                       print 'Note: link `' + link + "' is not supported."
-               continue
-       link_id = -1
-       for i, checked_link in enumerate(already_checked_links):
-               if link == checked_link['link']:
-                       link_id = i
-                       link_error = already_checked_links[link_id]['error']
-                       break
-       if link_id == -1:
-               print 'Unchecked link detected'
+       if url in checked_urls:
+               link_error = checked_urls[url]
+       else:
+               report(-3, filename + ':' + line_number \
+                        + ': Unchecked link `' + url + "' detected.")
+               if WICKED > 0:
+                       print 'Aborting due to an unchecked link.'
+                       exit(1)
                continue
-       checked_link = already_checked_links[link_id]
        # Report working links inside comments so that webmasters
        # could uncomment them.
-       if link_error == None and is_inside_comment == 'yes':
+       if link_error == None and commented == 'yes':
                link_error = 'no error detected'
-
-       if link_error != None:
-               if is_inside_comment == 'yes':
+       if link_error == None:
+               continue
+       file_prefix = report_file
+       if commented == 'yes':
                        link_error += ' (link commented out)'
-                       file_to_write = commented_file
-                       postfix = '/c'
-               else:
-                       file_to_write = report_file
-                       postfix = '/0'
-
+               file_prefix = commented_file
+       langcode = ''
                match = re.search(TRANSLATION_REGEXP, filename)
                if match:
-                       langcode = match.group('langcode')
-                       file_idx = langcode + postfix
-                       if file_idx not in report_files:
-                               report_files[file_idx] = \
-                                 file_to_write + '-' + langcode
-                               clear_file(report_files[file_idx])
-                       file_to_write = report_files[file_idx]
+               langcode = '-' + match.group('langcode')
+       file_to_write = file_prefix + langcode
+       if file_to_write not in report_files:
+               clear_file(file_to_write)
+               report_files.append(file_to_write)
                fd = open(file_to_write, 'a')
-               fd.write(format_error(filename, line_number, \
-                                       link, link_error))
+       fd.write(format_error(link_container['symlink'], filename, \
+                               line_number, url, link_error))
                fd.close()
 
-if VERBOSE >= 0:
-       print 'Done!'
+report(-1, 'Done!')



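As a closing aside (not part of the patch): the `.symlinks' format assumed
by SYMLINK_REGEXP puts the link target in the first field and the name the
link is published under in the second, with comments and blank lines
skipped.  A standalone sketch of that parsing, using invented sample
entries:

    # Parse .symlinks-style lines with the regexp from the patch;
    # the sample entries are made up for illustration.
    import re

    SYMLINK_REGEXP = r'^\s*(?P<to>[^\s]*)\s+(?P<from>[^\s]*).*$'

    sample = '''
    # comments and blank lines are skipped
    gnu/gnu.html  home.html
    philosophy/free-sw.html  freesw.html
    '''

    symlinks = {}
    for line in sample.splitlines():
        if re.search(r'^\s*(#|$)', line):
            continue
        match = re.search(SYMLINK_REGEXP, line)
        if match:
            # 'from' is the published name, 'to' is the target it points to
            symlinks[match.group('from')] = match.group('to')

    # symlinks == {'home.html': 'gnu/gnu.html',
    #              'freesw.html': 'philosophy/free-sw.html'}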