www/server/source/linc linc.py
From: Pavel Kharitonov
Subject: www/server/source/linc linc.py
Date: Mon, 18 Feb 2013 15:29:57 +0000
CVSROOT: /web/www
Module name: www
Changes by: Pavel Kharitonov <ineiev> 13/02/18 15:29:57
Modified files:
server/source/linc: linc.py
Log message:
Add caches; fix sorting the links by protocol.
CVSWeb URLs:
http://web.cvs.savannah.gnu.org/viewcvs/www/server/source/linc/linc.py?cvsroot=www&r1=1.10&r2=1.11
Patches:
Index: linc.py
===================================================================
RCS file: /web/www/www/server/source/linc/linc.py,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -b -r1.10 -r1.11
--- linc.py 14 Feb 2013 12:39:21 -0000 1.10
+++ linc.py 18 Feb 2013 15:29:50 -0000 1.11
@@ -20,7 +20,7 @@
# defines
-LINC_VERSION = 'LINC 0.6'
+LINC_VERSION = 'LINC 0.7'
COPYRIGHT= \
'Copyright (C) 2011-2012 Waclaw Jacek\n\
Copyright (C) 2013 Free Software Foundation, Inc.\n\
@@ -76,6 +76,8 @@
# Don't download the files, assume no error.
LOCAL = False
+CACHE = None
+
# regexp-related defines
# Matching directories will not be entered to check their
@@ -96,7 +98,7 @@
'http://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?(?P<resource>/[^#]*)?'
HTTP_NEW_LOCATION_HEADER = '^Location: (?P<new_location>.+)$'
# Links like href="mailto:..." and href="irc:..." are excluded.
-LINK_REGEXP = '<a( .+?)? href="(?P<link>[^mi].+?)"( .+?)?>'
+LINK_REGEXP = '<a(\s.+?)?\shref="(?P<link>[^"]+)"(\s.+?)?>'
TRANSLATION_REGEXP = '\.(?P<langcode>[a-z]{2}|[a-z]{2}-[a-z]{2})\.[^.]+$'
VERBOSE = 0
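
For illustration (not part of the patch), a minimal sketch of why the pattern changed: the old character class [^mi] was meant to skip mailto: and irc: links, but it also dropped any href whose value merely begins with "m" or "i"; the new pattern captures every quoted href (with \s also allowing tabs and newlines around attributes) and defers scheme filtering to the new classify_link() helper further down. The sample markup is hypothetical:

    import re

    OLD_LINK_REGEXP = '<a( .+?)? href="(?P<link>[^mi].+?)"( .+?)?>'
    NEW_LINK_REGEXP = '<a(\s.+?)?\shref="(?P<link>[^"]+)"(\s.+?)?>'

    for html in ['<a href="index.html">doc</a>',
                 '<a href="mailto:webmasters@gnu.org">mail</a>']:
        for name, regexp in [('old', OLD_LINK_REGEXP),
                             ('new', NEW_LINK_REGEXP)]:
            match = re.search(regexp, html)
            print name, '->', match.group('link') if match else '(no match)'
    # old -> (no match)   ("index.html" starts with "i" and was wrongly skipped)
    # new -> index.html
    # old -> (no match)   (mailto: excluded at the regexp level)
    # new -> mailto:webmasters@gnu.org   (captured; classify_link() filters it)
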
@@ -112,6 +114,8 @@
# global variables
files_to_check = []
+remote_site_root = None
+remote_base_directory = None
# functions
@@ -122,9 +126,10 @@
def get_ftp_link_error( link ):
connection_data = re.search( FTP_LINK_REGEXP, link )
- if connection_data:
- hostname = connection_data.group( 'hostname' )
- port = connection_data.group( 'port' )
+ if not connection_data:
+ return None
+ hostname = connection_data.group('hostname')
+ port = connection_data.group('port')
if port == None:
port = 21
@@ -135,7 +140,7 @@
if socketfd == None:
return None
- if socket_connect( socketfd, hostname, port ) == False:
+ if socket_connect(socketfd, hostname, port) == False:
socketfd.close()
return 'couldn\'t connect to host'
@@ -300,7 +305,62 @@
print COPYRIGHT
exit(0)
-### OK, main program below.
+def classify_link(filename, link):
+ link_type = 'http'
+ if re.search('^(mailto:|irc://|https://)', link):
+ link_type = 'unsupported'
+ elif link.find('http://') == 0:
+ link_type = 'http'
+ elif link.find('ftp://') == 0:
+ link_type = 'ftp'
+ elif link[0] == '/':
+ link = remote_site_root + link[1:]
+ else:
+ subdir = ''
+ pos = filename.rfind('/')
+ if pos != -1:
+ subdir = filename[: pos] + '/'
+ link = remote_base_directory + subdir + link
+ return [link_type, link]
+
+def load_cache(cache):
+ if cache == None:
+ if VERBOSE > 2:
+ print "No cache file is loaded."
+ return []
+ try:
+ f = open(cache, 'r')
+ except IOError:
+ if VERBOSE > -3:
+ print "Failed to read cache file `" + cache + "'."
+ return []
+ if VERBOSE > 2:
+ print "Loading cache file `" + cache +"'."
+ links = f.read().splitlines()
+ f.close();
+ if VERBOSE > 2:
+ print "Loaded links: " + str(len(links))
+ return links
+
+def save_cache(cache, checked_links):
+ if cache == None:
+ if VERBOSE > 2:
+ print "No cache file is saved."
+ return
+ try:
+ f = open(cache, 'w')
+ except IOError:
+ if VERBOSE > -3:
+ print "Failed to write cache file `" + cache + "'."
+ return
+ if VERBOSE > 2:
+ print "\nSaving cache file `" + cache +"'."
+ for link in checked_links:
+ # Links containing a newline are not cached
+ # because newline is used in cache as the separator.
+ if link['error'] == None and link['link'].find('\n') == -1:
+ f.write(link['link'] + '\n')
+ f.close()
usage = \
'Usage: %prog [options] [BASE_DIRECTORY]\n\
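
To make the new helper's contract concrete, here is a small self-contained sketch; the two global values are hypothetical stand-ins for what the main program derives from its options, and the function body is copied from the hunk above. Note that https:// links are now typed 'unsupported' up front; the old inline code labeled them 'https' and the checking loop skipped them anyway, so behavior is unchanged but the intent is explicit.

    import re

    # Hypothetical values; a real run computes these from the command line.
    remote_site_root = 'http://www.gnu.org/'
    remote_base_directory = 'http://www.gnu.org/software/'

    def classify_link(filename, link):  # copied from the hunk above
        link_type = 'http'
        if re.search('^(mailto:|irc://|https://)', link):
            link_type = 'unsupported'
        elif link.find('http://') == 0:
            link_type = 'http'
        elif link.find('ftp://') == 0:
            link_type = 'ftp'
        elif link[0] == '/':
            link = remote_site_root + link[1:]
        else:
            subdir = ''
            pos = filename.rfind('/')
            if pos != -1:
                subdir = filename[: pos] + '/'
            link = remote_base_directory + subdir + link
        return [link_type, link]

    for link in ['/gnu/gnu.html', 'doc/ref.html',
                 'ftp://ftp.gnu.org/', 'mailto:webmasters@gnu.org']:
        print classify_link('linc/index.html', link)
    # ['http', 'http://www.gnu.org/gnu/gnu.html']              (site-root-relative)
    # ['http', 'http://www.gnu.org/software/linc/doc/ref.html'] (file-relative)
    # ['ftp', 'ftp://ftp.gnu.org/']
    # ['unsupported', 'mailto:webmasters@gnu.org']              (skipped by caller)
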
@@ -315,6 +375,8 @@
type = 'float', metavar = 'DELAY',
help = 'delay between checks in seconds [' \
+ str(DELAY_BETWEEN_CHECKS) + ']')
+parser.add_option('-C', '--cache', dest = 'cache', metavar = 'FILE',
+ help = 'use cache FILE')
parser.add_option('-f', '--forwards', dest = 'forwards', type = 'int',
metavar = 'N',
help = 'maximum number of forwards to follow [' \
@@ -397,6 +459,7 @@
EXCLUDED_DIRECTORIES_REGEXP = options.exclude_dir
if options.local != None:
LOCAL = options.local
+CACHE = options.cache
base_directory = BASE_DIRECTORY
remote_base_directory = REMOTE_BASE_DIRECTORY
@@ -415,6 +478,8 @@
if VERBOSE > 0:
print "Base directory: `" + BASE_DIRECTORY + "'"
+ print "Cache file: " + \
+ ("`" + CACHE + "'" if CACHE else "(None)")
print "Number of attempts: " + str(NUMBER_OF_ATTEMPTS)
print "Delay between checks: " + str(DELAY_BETWEEN_CHECKS)
print "Delay between retries: " + str(DELAY_BETWEEN_RETRIES)
@@ -469,6 +534,9 @@
number_of_links_to_check = str( len( links_to_check ) )
already_checked_links = []
+cached_links = load_cache(CACHE)
+for link in cached_links:
+ already_checked_links.append({'link': link, 'error': None})
for j in range(NUMBER_OF_ATTEMPTS):
if VERBOSE > -2:
print 'Pass ' + str(j + 1) + ' of ' + str(NUMBER_OF_ATTEMPTS) + ':'
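
Each cached URL is re-registered here as an already-checked link with error set to None, so the passes skip it; save_cache(), run after every pass, writes back only the links that checked out, one per line. A minimal round-trip sketch, with a hypothetical cache file name and link values:

    good = {'link': 'http://www.gnu.org/', 'error': None}
    bad = {'link': 'http://bad.example.org/',
           'error': 'couldn\'t connect to host'}

    f = open('linc.cache', 'w')          # hypothetical cache file name
    for link in [good, bad]:
        # the same test save_cache() applies: only good, newline-free links
        if link['error'] == None and link['link'].find('\n') == -1:
            f.write(link['link'] + '\n')
    f.close()

    for cached in open('linc.cache').read().splitlines():
        print {'link': cached, 'error': None}   # seeds already_checked_links
    # prints one pre-approved entry for http://www.gnu.org/; the broken
    # link was not cached and will be rechecked on the next run
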
@@ -480,34 +548,17 @@
filename = link_container['filename']
link = link_container['link']
- if link_container['is_inside_comment'] == 'ssi':
- continue
- if link[0] == '#':
+ if link_container['is_inside_comment'] == 'ssi' or link[0] == '#':
continue
- link_type = None
- if VERBOSE > 2 and link[0] != '/' and link[0] != '#' \
- and link.find('://') == -1:
+ if VERBOSE > 2 and link[0] != '/' and not re.search('^[^/]*:', link):
print '\n' + filename + ':' \
+ str(link_container['line_number']) + ': link ' \
+ str(i) + ' `' + link + "' is relative"
- if link[:6] == 'ftp://':
- link_type = 'ftp'
- elif link[:7] == 'http://':
- link_type = 'http'
- elif link[:8] == 'https://':
- link_type = 'https'
- elif link[0] == '/':
- link_type = 'http'
- link = remote_site_root + link[1:]
- else:
- link_type = 'http'
- subdir = ''
- pos = filename.rfind( '/' )
- if pos != -1:
- subdir = filename[: pos] + '/'
- link = remote_base_directory + subdir + link
+ [link_type, link] = classify_link(filename, link)
+ if link_type == 'unsupported':
+ continue
link_id = -1
link_error = None
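
One subtlety in this hunk: the verbose `is relative' diagnostic no longer tests for a literal '://'. Since the new LINK_REGEXP also captures mailto: and irc: hrefs, a scheme without slashes would have been misreported as relative; the new test '^[^/]*:' treats any link with a colon before the first slash as absolute. A small sketch of the difference, with hypothetical sample links:

    import re

    for link in ['gnu/why-gnu.html', 'mailto:webmasters@gnu.org',
                 'irc://irc.example.org/gnu', 'http://www.gnu.org/']:
        old_relative = link[0] != '/' and link.find('://') == -1
        new_relative = link[0] != '/' and not re.search('^[^/]*:', link)
        print link, '-> old:', old_relative, 'new:', new_relative
    # gnu/why-gnu.html -> old: True new: True     (genuinely relative)
    # mailto:webmasters@gnu.org -> old: True new: False   (old test misfired)
    # irc://irc.example.org/gnu -> old: False new: False
    # http://www.gnu.org/ -> old: False new: False
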
@@ -528,17 +579,16 @@
elif link_type == 'http':
link_error = get_http_link_error( link )
else:
- continue # ignore the link,
- # since its protocol is unsupported
+ continue
if checked_link != None:
if link_error == None:
already_checked_links[link_id]['error'] = None
else:
already_checked_links.append( { 'link': link, \
'error': link_error } )
-
if DELAY_BETWEEN_CHECKS > 0:
time.sleep(DELAY_BETWEEN_CHECKS)
+ save_cache(CACHE, already_checked_links)
if VERBOSE > -2:
broken_so_far = 0
for checked_link in already_checked_links:
@@ -552,7 +602,7 @@
print 'link ' + str(i) + ': ' \
+ checked_link['link'] + ': ' \
+ (checked_link['error'] \
- if checked_link['error'] else '')
+ if checked_link['error'] else '(no error)')
if broken_so_far == 0:
print 'No more broken links; skipping the rest passes (if any)'
break
@@ -578,26 +628,11 @@
if link[0] == '#':
continue
- link_type = None
-
- if link[:6] == 'ftp://':
- link_type = 'ftp'
- elif link[:7] == 'http://':
- link_type = 'http'
- elif link[:8] == 'https://':
- link_type = 'https'
- elif link[0] == '/':
- link_type = 'http'
- link = remote_site_root + link[1:]
- else:
- link_type = 'http'
- subdir = ''
- pos = filename.rfind( '/' )
- if pos != -1:
- subdir = filename[: pos] + '/'
- link = remote_base_directory + subdir + link
+ [link_type, link] = classify_link(filename, link)
- if link_type != 'ftp' and link_type != 'http':
+ if link_type == 'unsupported':
+ if VERBOSE > 2:
+ print 'Note: link `' + link + "' is not supported."
continue
link_id = -1
for i, checked_link in enumerate(already_checked_links):