www/server/source/linc linc.py
From: Pavel Kharitonov
Subject: www/server/source/linc linc.py
Date: Mon, 18 Feb 2013 15:29:57 +0000
CVSROOT: /web/www
Module name: www
Changes by: Pavel Kharitonov <ineiev> 13/02/18 15:29:57
Modified files:
server/source/linc: linc.py
Log message:
Add caches; fix sorting the links by protocol.
CVSWeb URLs:
http://web.cvs.savannah.gnu.org/viewcvs/www/server/source/linc/linc.py?cvsroot=www&r1=1.10&r2=1.11
Patches:
Index: linc.py
===================================================================
RCS file: /web/www/www/server/source/linc/linc.py,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -b -r1.10 -r1.11
--- linc.py 14 Feb 2013 12:39:21 -0000 1.10
+++ linc.py 18 Feb 2013 15:29:50 -0000 1.11
@@ -20,7 +20,7 @@
# defines
-LINC_VERSION = 'LINC 0.6'
+LINC_VERSION = 'LINC 0.7'
COPYRIGHT= \
'Copyright (C) 2011-2012 Waclaw Jacek\n\
Copyright (C) 2013 Free Software Foundation, Inc.\n\
@@ -76,6 +76,8 @@
# Don't download the files, assume no error.
LOCAL = False
+CACHE = None
+
# regexp-related defines
# Matching directories will not be entered to check their
@@ -96,7 +98,7 @@
'http://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?(?P<resource>/[^#]*)?'
HTTP_NEW_LOCATION_HEADER = '^Location: (?P<new_location>.+)$'
# Links like href="mailto:..." and href="irc:..." are excluded.
-LINK_REGEXP = '<a( .+?)? href="(?P<link>[^mi].+?)"( .+?)?>'
+LINK_REGEXP = '<a(\s.+?)?\shref="(?P<link>[^"]+)"(\s.+?)?>'
TRANSLATION_REGEXP = '\.(?P<langcode>[a-z]{2}|[a-z]{2}-[a-z]{2})\.[^.]+$'
VERBOSE = 0
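
For illustration (not part of the patch), a minimal sketch of why the pattern changed: the old character class [^mi] was meant to skip mailto: and irc: links, but it also dropped any href whose value merely begins with "m" or "i"; the new pattern captures every quoted href (with \s also allowing tabs and newlines around attributes) and defers scheme filtering to the new classify_link() helper further down. The sample markup is hypothetical:

    import re

    OLD_LINK_REGEXP = '<a( .+?)? href="(?P<link>[^mi].+?)"( .+?)?>'
    NEW_LINK_REGEXP = '<a(\s.+?)?\shref="(?P<link>[^"]+)"(\s.+?)?>'

    for html in ['<a href="index.html">doc</a>',
                 '<a href="mailto:webmasters@gnu.org">mail</a>']:
        for name, regexp in [('old', OLD_LINK_REGEXP),
                             ('new', NEW_LINK_REGEXP)]:
            match = re.search(regexp, html)
            print name, '->', match.group('link') if match else '(no match)'
    # old -> (no match)   ("index.html" starts with "i" and was wrongly skipped)
    # new -> index.html
    # old -> (no match)   (mailto: excluded at the regexp level)
    # new -> mailto:webmasters@gnu.org   (captured; classify_link() filters it)
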
@@ -112,6 +114,8 @@
# global variables
files_to_check = []
+remote_site_root = None
+remote_base_directory = None
# functions
@@ -122,9 +126,10 @@
def get_ftp_link_error( link ):
connection_data = re.search( FTP_LINK_REGEXP, link )
- if connection_data:
- hostname = connection_data.group( 'hostname' )
- port = connection_data.group( 'port' )
+ if not connection_data:
+ return None
+ hostname = connection_data.group('hostname')
+ port = connection_data.group('port')
if port == None:
port = 21
@@ -135,7 +140,7 @@
if socketfd == None:
return None
- if socket_connect( socketfd, hostname, port ) == False:
+ if socket_connect(socketfd, hostname, port) == False:
socketfd.close()
return 'couldn\'t connect to host'
@@ -300,7 +305,62 @@
print COPYRIGHT
exit(0)
-### OK, main program below.
+def classify_link(filename, link):
+ link_type = 'http'
+ if re.search('^(mailto:|irc://|https://)', link):
+ link_type = 'unsupported'
+ elif link.find('http://') == 0:
+ link_type = 'http'
+ elif link.find('ftp://') == 0:
+ link_type = 'ftp'
+ elif link[0] == '/':
+ link = remote_site_root + link[1:]
+ else:
+ subdir = ''
+ pos = filename.rfind('/')
+ if pos != -1:
+ subdir = filename[: pos] + '/'
+ link = remote_base_directory + subdir + link
+ return [link_type, link]
+
+def load_cache(cache):
+ if cache == None:
+ if VERBOSE > 2:
+ print "No cache file is loaded."
+ return []
+ try:
+ f = open(cache, 'r')
+ except IOError:
+ if VERBOSE > -3:
+ print "Failed to read cache file `" + cache + "'."
+ return []
+ if VERBOSE > 2:
+ print "Loading cache file `" + cache +"'."
+ links = f.read().splitlines()
+ f.close();
+ if VERBOSE > 2:
+ print "Loaded links: " + str(len(links))
+ return links
+
+def save_cache(cache, checked_links):
+ if cache == None:
+ if VERBOSE > 2:
+ print "No cache file is saved."
+ return
+ try:
+ f = open(cache, 'w')
+ except IOError:
+ if VERBOSE > -3:
+ print "Failed to write cache file `" + cache + "'."
+ return
+ if VERBOSE > 2:
+ print "\nSaving cache file `" + cache +"'."
+ for link in checked_links:
+ # Links containing a newline are not cached
+ # because newline is used in cache as the separator.
+ if link['error'] == None and link['link'].find('\n') == -1:
+ f.write(link['link'] + '\n')
+ f.close()
usage = \
'Usage: %prog [options] [BASE_DIRECTORY]\n\
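
To make the new helper's contract concrete, here is a small self-contained sketch; the two global values are hypothetical stand-ins for what the main program derives from its options, and the function body is copied from the hunk above. Note that https:// links are now typed 'unsupported' up front; the old inline code labeled them 'https' and the checking loop skipped them anyway, so behavior is unchanged but the intent is explicit.

    import re

    # Hypothetical values; a real run computes these from the command line.
    remote_site_root = 'http://www.gnu.org/'
    remote_base_directory = 'http://www.gnu.org/software/'

    def classify_link(filename, link):  # copied from the hunk above
        link_type = 'http'
        if re.search('^(mailto:|irc://|https://)', link):
            link_type = 'unsupported'
        elif link.find('http://') == 0:
            link_type = 'http'
        elif link.find('ftp://') == 0:
            link_type = 'ftp'
        elif link[0] == '/':
            link = remote_site_root + link[1:]
        else:
            subdir = ''
            pos = filename.rfind('/')
            if pos != -1:
                subdir = filename[: pos] + '/'
            link = remote_base_directory + subdir + link
        return [link_type, link]

    for link in ['/gnu/gnu.html', 'doc/ref.html',
                 'ftp://ftp.gnu.org/', 'mailto:webmasters@gnu.org']:
        print classify_link('linc/index.html', link)
    # ['http', 'http://www.gnu.org/gnu/gnu.html']              (site-root-relative)
    # ['http', 'http://www.gnu.org/software/linc/doc/ref.html'] (file-relative)
    # ['ftp', 'ftp://ftp.gnu.org/']
    # ['unsupported', 'mailto:webmasters@gnu.org']              (skipped by caller)
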
@@ -315,6 +375,8 @@
type = 'float', metavar = 'DELAY',
help = 'delay between checks in seconds [' \
+ str(DELAY_BETWEEN_CHECKS) + ']')
+parser.add_option('-C', '--cache', dest = 'cache', metavar = 'FILE',
+ help = 'use cache FILE')
parser.add_option('-f', '--forwards', dest = 'forwards', type = 'int',
metavar = 'N',
help = 'maximum number of forwards to follow [' \
@@ -397,6 +459,7 @@
EXCLUDED_DIRECTORIES_REGEXP = options.exclude_dir
if options.local != None:
LOCAL = options.local
+CACHE = options.cache
base_directory = BASE_DIRECTORY
remote_base_directory = REMOTE_BASE_DIRECTORY
@@ -415,6 +478,8 @@
if VERBOSE > 0:
print "Base directory: `" + BASE_DIRECTORY + "'"
+ print "Cache file: " + \
+ ("`" + CACHE + "'" if CACHE else "(None)")
print "Number of attempts: " + str(NUMBER_OF_ATTEMPTS)
print "Delay between checks: " + str(DELAY_BETWEEN_CHECKS)
print "Delay between retries: " + str(DELAY_BETWEEN_RETRIES)
@@ -469,6 +534,9 @@
number_of_links_to_check = str( len( links_to_check ) )
already_checked_links = []
+cached_links = load_cache(CACHE)
+for link in cached_links:
+ already_checked_links.append({'link': link, 'error': None})
for j in range(NUMBER_OF_ATTEMPTS):
if VERBOSE > -2:
print 'Pass ' + str(j + 1) + ' of ' + str(NUMBER_OF_ATTEMPTS) + ':'
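
Each cached URL is re-registered here as an already-checked link with error set to None, so the passes skip it; save_cache(), run after every pass, writes back only the links that checked out, one per line. A minimal round-trip sketch, with a hypothetical cache file name and link values:

    good = {'link': 'http://www.gnu.org/', 'error': None}
    bad = {'link': 'http://bad.example.org/',
           'error': 'couldn\'t connect to host'}

    f = open('linc.cache', 'w')          # hypothetical cache file name
    for link in [good, bad]:
        # the same test save_cache() applies: only good, newline-free links
        if link['error'] == None and link['link'].find('\n') == -1:
            f.write(link['link'] + '\n')
    f.close()

    for cached in open('linc.cache').read().splitlines():
        print {'link': cached, 'error': None}   # seeds already_checked_links
    # prints one pre-approved entry for http://www.gnu.org/; the broken
    # link was not cached and will be rechecked on the next run
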
@@ -480,34 +548,17 @@
filename = link_container['filename']
link = link_container['link']
- if link_container['is_inside_comment'] == 'ssi':
- continue
- if link[0] == '#':
+ if link_container['is_inside_comment'] == 'ssi' or link[0] == '#':
continue
- link_type = None
- if VERBOSE > 2 and link[0] != '/' and link[0] != '#' \
- and link.find('://') == -1:
+ if VERBOSE > 2 and link[0] != '/' and not re.search('^[^/]*:', link):
print '\n' + filename + ':' \
+ str(link_container['line_number']) + ': link ' \
+ str(i) + ' `' + link + "' is relative"
- if link[:6] == 'ftp://':
- link_type = 'ftp'
- elif link[:7] == 'http://':
- link_type = 'http'
- elif link[:8] == 'https://':
- link_type = 'https'
- elif link[0] == '/':
- link_type = 'http'
- link = remote_site_root + link[1:]
- else:
- link_type = 'http'
- subdir = ''
- pos = filename.rfind( '/' )
- if pos != -1:
- subdir = filename[: pos] + '/'
- link = remote_base_directory + subdir + link
+ [link_type, link] = classify_link(filename, link)
+ if link_type == 'unsupported':
+ continue
link_id = -1
link_error = None
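
One subtlety in this hunk: the verbose `is relative' diagnostic no longer tests for a literal '://'. Since the new LINK_REGEXP also captures mailto: and irc: hrefs, a scheme without slashes would have been misreported as relative; the new test '^[^/]*:' treats any link with a colon before the first slash as absolute. A small sketch of the difference, with hypothetical sample links:

    import re

    for link in ['gnu/why-gnu.html', 'mailto:webmasters@gnu.org',
                 'irc://irc.example.org/gnu', 'http://www.gnu.org/']:
        old_relative = link[0] != '/' and link.find('://') == -1
        new_relative = link[0] != '/' and not re.search('^[^/]*:', link)
        print link, '-> old:', old_relative, 'new:', new_relative
    # gnu/why-gnu.html -> old: True new: True     (genuinely relative)
    # mailto:webmasters@gnu.org -> old: True new: False   (old test misfired)
    # irc://irc.example.org/gnu -> old: False new: False
    # http://www.gnu.org/ -> old: False new: False
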
@@ -528,17 +579,16 @@
elif link_type == 'http':
link_error = get_http_link_error( link )
else:
- continue # ignore the link,
- # since its protocol is unsupported
+ continue
if checked_link != None:
if link_error == None:
already_checked_links[link_id]['error'] = None
else:
already_checked_links.append( { 'link': link, \
'error': link_error } )
-
if DELAY_BETWEEN_CHECKS > 0:
time.sleep(DELAY_BETWEEN_CHECKS)
+ save_cache(CACHE, already_checked_links)
if VERBOSE > -2:
broken_so_far = 0
for checked_link in already_checked_links:
@@ -552,7 +602,7 @@
print 'link ' + str(i) + ': ' \
+ checked_link['link'] + ': ' \
+ (checked_link['error'] \
- if checked_link['error'] else '')
+ if checked_link['error'] else '(no error)')
if broken_so_far == 0:
print 'No more broken links; skipping the rest passes (if any)'
break
@@ -578,26 +628,11 @@
if link[0] == '#':
continue
- link_type = None
-
- if link[:6] == 'ftp://':
- link_type = 'ftp'
- elif link[:7] == 'http://':
- link_type = 'http'
- elif link[:8] == 'https://':
- link_type = 'https'
- elif link[0] == '/':
- link_type = 'http'
- link = remote_site_root + link[1:]
- else:
- link_type = 'http'
- subdir = ''
- pos = filename.rfind( '/' )
- if pos != -1:
- subdir = filename[: pos] + '/'
- link = remote_base_directory + subdir + link
+ [link_type, link] = classify_link(filename, link)
- if link_type != 'ftp' and link_type != 'http':
+ if link_type == 'unsupported':
+ if VERBOSE > 2:
+ print 'Note: link `' + link + "' is not supported."
continue
link_id = -1
for i, checked_link in enumerate(already_checked_links):