[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
www/server/source/linc linc.py
From: Pavel Kharitonov
Subject: www/server/source/linc linc.py
Date: Mon, 03 Dec 2012 10:22:57 +0000
CVSROOT: /web/www
Module name: www
Changes by: Pavel Kharitonov <ineiev> 12/12/03 10:22:57
Modified files:
server/source/linc: linc.py
Log message:
Next revision (RT #786229):
Limit maximum length of lines.
Improve some diagnostic messages.
Add 'education/draft' to excluded directories.
Use 'Lynx/2.8.6rel.5' rather than 'LINC/alpha' as the 'User-Agent'
header.
Report commented out links (including the working ones) in separate
files.
Eliminate the 'SEPARATE_TRANSLATION_REPORTS' variable; translation
reports
are always separated.
Open, append and close the report files on every report rather than keep
them all open at once.
CVSWeb URLs:
http://web.cvs.savannah.gnu.org/viewcvs/www/server/source/linc/linc.py?cvsroot=www&r1=1.3&r2=1.4
Patches:
Index: linc.py
===================================================================
RCS file: /web/www/www/server/source/linc/linc.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -b -r1.3 -r1.4
--- linc.py 2 Apr 2012 00:04:40 -0000 1.3
+++ linc.py 3 Dec 2012 10:22:57 -0000 1.4
@@ -2,6 +2,7 @@
#
# LINC - LINC Is Not Checklink
# Copyright © 2011-2012 Wacław Jacek
+# Copyright © 2012 Free Software Foundation, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -18,28 +19,67 @@
# defines
-BASE_DIRECTORY = '/home/w/wwj/www-repo/'
+BASE_DIRECTORY = '/home/g/gnun/checkouts/www/'
REMOTE_BASE_DIRECTORY = 'http://www.gnu.org/'
-ADDITIONAL_HTTP_HEADERS = 'User-Agent: LINC/alpha\r\nAccept: text/html,
text/plain, audio/mod, image/*, application/msword, application/pdf,
application/postscript, text/sgml, video/mpeg, */*;q=0.01\r\n' # end every
header with "\r\n"
-DELAY_BETWEEN_CHECKS = 1 # In seconds. Set to 0 to disable delay between
checks of different links.
-DELAY_BETWEEN_RETRIES = 10 # In seconds. Used when a link fails before
re-checking it. Set to 0 to disable delays.
+# End every header with "\r\n"
+# We say we are like Lynx because some ignorant sites like Sourceforge don't
+# know what LINC/alpha is and still discriminate on User-Agent with the default
+# behavior being inappropriate for us.
+ADDITIONAL_HTTP_HEADERS = 'User-Agent: Lynx/2.8.6rel.5\r\n\
+Accept: text/html, text/plain, audio/mod, image/*, application/msword, \
+application/pdf, application/postscript, text/sgml, video/mpeg, */*;q=0.01\r\n'
+
+# In seconds. Set negative to disable delay between checks of different links.
+DELAY_BETWEEN_CHECKS = 1
+
+# In seconds. Used when a link fails before re-checking it.
+# Set negative to disable delays.
+DELAY_BETWEEN_RETRIES = 10
+
FORWARDS_TO_FOLLOW = 5 # How many forwards should be followed.
-NUMBER_OF_ATTEMPTS = 3 # Number of times to check a link for error. If an
attempt is successful, the link is no longer checked during that program run.
-REPORT_FILE_NAME = 'reports-temp/broken_links' # Path to the file to which the
errors will be reported.
-SEPARATE_TRANSLATION_REPORTS = True # If you set this to True, reports for
translations will be saved into "REPORT_FILE_NAME.lang" instead of in the main
report file.
-SKIP_TRANSLATION_FILES = False # If you set this to True, files with
translations will be skipped.
-SOCKET_TIMEOUT = 20 # After what time to give up with trying to retrieve a
website.
+
+# Number of times to check a link for error. If an attempt is successful,
+# the link is no longer checked during that program run.
+NUMBER_OF_ATTEMPTS = 3
+
+# Path to the file to which the errors will be reported.
+# Note: this is typically a temporary directory because
+# linc run takes many hours (almost a day), and the old results
+# wouldn't be available during that period if new files went
+# directly to the destination directory.
+REPORT_FILE_PREFIX = 'reports-temp'
+
+# File to which the errors will be reported.
+REPORT_FILE_NAME = 'broken_links'
+
+# File to which commented out links will be reported.
+COMMENTED_FILE_NAME = 'commented_out'
+
+# If you set this to True, files with translations will be skipped.
+SKIP_TRANSLATION_FILES = False
+
+# After what time to give up with trying to retrieve a website.
+SOCKET_TIMEOUT = 20
# regexp-related defines
-EXCLUDED_DIRECTORIES_REGEXP =
'^(japan|wwwes|wwwin|education/fr|press|server/staging|software/[^/]+)$|(^|/)po$'
# Matching directories will not be entered to check their files or
subdirectories.
-EXCLUDED_FILENAMES_REGEXP =
'^server/standards/boilerplate\.html|server/.*whatsnew\.html$'
+# Matching directories will not be entered to check their
+# files or subdirectories.
+EXCLUDED_DIRECTORIES_REGEXP = '^(japan|wwwes|wwwin|education/fr|\
+education/draft|press|server/staging|software/[^/]+)$|(^|/)po$'
+EXCLUDED_FILENAMES_REGEXP = \
+ '^server/standards/boilerplate\.html|server/.*whatsnew\.html$'
+
FILENAMES_TO_CHECK_REGEXP = '\.html$' # Only matching files will be checked.
+
FTP_LINK_REGEXP = 'ftp://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?'
-HTTP_ERROR_HEADER = '^HTTP/1\.1 (?P<http_error_code>403|404) ' # What to treat
as a HTTP error header.
+
+# What to treat as a HTTP error header.
+HTTP_ERROR_HEADER = '^HTTP/1\.1 (?P<http_error_code>403|404) '
HTTP_FORWARD_HEADER = '^HTTP/1\.1 (301 Moved Permanently|302 Found)$'
-HTTP_LINK_REGEXP =
'http://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?(?P<resource>/[^#]*)?'
+HTTP_LINK_REGEXP = \
+ 'http://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?(?P<resource>/[^#]*)?'
HTTP_NEW_LOCATION_HEADER = '^Location: (?P<new_location>.+)$'
LINK_REGEXP = '<a( .+?)? href="(?P<link>[^mailto:].+?)"( .+?)?>'
TRANSLATION_REGEXP = '\.(?P<langcode>[a-z]{2}|[a-z]{2}-[a-z]{2})\.[^.]+$'
@@ -59,7 +99,9 @@
# functions
def format_error( filename, line_number, link, error_message ):
- return str( filename ) + ':' + str( line_number ) + ': ' + str( link
).replace( ' ', '%20' ) + ' ' + str( error_message ) + '\n'
+ return str( filename ) + ':' + str( line_number ) + ': ' \
+ + str( link ).replace( ' ', '%20' ) + ' ' \
+ + str( error_message ) + '\n'
def get_ftp_link_error( link ):
connection_data = re.search( FTP_LINK_REGEXP, link )
@@ -71,7 +113,9 @@
port = 21
socketfd = socket_create()
- if socketfd == None: # if a socket couldn't be created, just
ignore this link this time.
+ # if a socket couldn't be created,
+ # just ignore this link this time.
+ if socketfd == None:
return None
if socket_connect( socketfd, hostname, port ) == False:
@@ -81,12 +125,14 @@
socketfd.close()
return None
-def get_http_link_error( link, forwarded_from = None ): # forwarded_from is
either None or a list
+# forwarded_from is either None or a list
+def get_http_link_error( link, forwarded_from = None ):
if forwarded_from == None:
forwarded_from = []
connection_data = re.search( HTTP_LINK_REGEXP, link )
- if connection_data:
+ if not connection_data:
+ return None
hostname = connection_data.group( 'hostname' )
port = connection_data.group( 'port' )
resource = connection_data.group( 'resource' )
@@ -95,7 +141,9 @@
port = 80
socketfd = socket_create()
- if socketfd == None: # if a socket couldn't be created, just
ignore this link this time.
+ # if a socket couldn't be created,
+ # just ignore this link this time.
+ if socketfd == None:
return None
if socket_connect( socketfd, hostname, port ) == False:
@@ -105,7 +153,8 @@
if resource == None:
resource = '/'
- socketfd.send( 'GET ' + resource + ' HTTP/1.1\r\nHost: ' +
hostname + '\r\n' + ADDITIONAL_HTTP_HEADERS + '\r\n' )
+ socketfd.send( 'GET ' + resource + ' HTTP/1.1\r\nHost: ' \
+ + hostname + '\r\n' + ADDITIONAL_HTTP_HEADERS + '\r\n' )
webpage = socket_read( socketfd )
if webpage == None:
@@ -116,7 +165,8 @@
end_of_headers_pos = webpage.find( '\r\n\r\n' )
if end_of_headers_pos == -1:
- return 'couldn\'t find end of headers (possibly no
content in file)'
+ return 'couldn\'t find end of ' \
+ + 'headers (possibly no content in file)'
header_lines = webpage[ : end_of_headers_pos ]
header_lines = header_lines.split( '\r\n' )
@@ -129,69 +179,76 @@
# look for forwards
match = regexp_search_list( HTTP_FORWARD_HEADER, header_lines )
+ if not match:
+ return None
+ # if we haven't been forwarded too many times yet...
+ if len( forwarded_from ) >= FORWARDS_TO_FOLLOW:
+ # we've been forwarded too many times, sorry.
+ return 'too many forwards (over ' \
+ + str( len( forwarded_from ) ) + ')'
+ match = regexp_search_list(HTTP_NEW_LOCATION_HEADER, header_lines)
if match:
- if len( forwarded_from ) < FORWARDS_TO_FOLLOW: # if we
haven't been forwarded too many times yet...
- match = regexp_search_list(
HTTP_NEW_LOCATION_HEADER, header_lines )
- if match:
- forwarded_from.append( link )
- new_location = match.group(
'new_location' )
+ forwarded_from.append(link)
+ new_location = match.group('new_location')
if new_location in forwarded_from:
return 'forward loop!'
- else:
- return get_http_link_error(
new_location, forwarded_from )
- else: # we've been forwarded too many times, sorry.
- return 'too many forwards (over ' + str( len(
forwarded_from ) ) + ')'
-
+ return get_http_link_error(new_location, forwarded_from)
return None
-def is_match_inside_comment_block( regexp_match ):
+def is_match_inside_comment( regexp_match ):
haystack = regexp_match.string
match_pos = regexp_match.start()
- comment_block_start = haystack.rfind( '<!--', 0, match_pos )
- comment_block_end = haystack.rfind( '-->', 0, match_pos )
+ comment_block_start = haystack.rfind('<!--', 0, match_pos)
+ comment_block_end = haystack.rfind('-->', 0, match_pos)
if comment_block_start > comment_block_end:
return True
return False
-def regexp_search_list( regexp, the_list ):
+def regexp_search_list(regexp, the_list):
for list_element in the_list:
- match = re.search( regexp, list_element )
+ match = re.search(regexp, list_element)
if match:
return match
return None
-def search_directory_for_files( base_directory, directory ):
- for element_name in os.listdir( os.path.join( base_directory, directory
) ):
- relative_path_to_element = os.path.join( directory,
element_name )
- full_path_to_element = os.path.join( base_directory,
relative_path_to_element )
- if os.path.isdir( full_path_to_element ):
- if re.search( EXCLUDED_DIRECTORIES_REGEXP,
relative_path_to_element ):
+def search_directory_for_files(base_directory, directory):
+ for element_name in os.listdir(os.path.join(base_directory, directory)):
+ relative_path_to_element = os.path.join(directory, element_name)
+ full_path_to_element = os.path.join(base_directory, \
+ relative_path_to_element)
+ if os.path.isdir(full_path_to_element):
+ if re.search(EXCLUDED_DIRECTORIES_REGEXP, \
+ relative_path_to_element):
continue
- search_directory_for_files( base_directory,
relative_path_to_element )
+ search_directory_for_files(base_directory, \
+ relative_path_to_element)
else: # it's a file
- if not re.search( FILENAMES_TO_CHECK_REGEXP,
element_name ):
+ if not re.search(FILENAMES_TO_CHECK_REGEXP, \
+ element_name):
continue
- if ( SKIP_TRANSLATION_FILES == True ) and re.search(
TRANSLATION_REGEXP, element_name ):
+ if (SKIP_TRANSLATION_FILES == True) \
+ and re.search(TRANSLATION_REGEXP, element_name):
continue
- if re.search( EXCLUDED_FILENAMES_REGEXP,
relative_path_to_element ):
+ if re.search(EXCLUDED_FILENAMES_REGEXP, \
+ relative_path_to_element):
continue
- files_to_check.append( relative_path_to_element )
+ files_to_check.append(relative_path_to_element)
-def socket_connect( socketfd, hostname, port ):
+def socket_connect(socketfd, hostname, port):
try:
- socketfd.connect( ( hostname, int( port ) ) )
+ socketfd.connect((hostname, int(port)))
except socket.error, message:
return False
return True
def socket_create():
try:
- socketfd = socket.socket( socket.AF_INET, socket.SOCK_STREAM )
- socketfd.settimeout( SOCKET_TIMEOUT )
+ socketfd = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ socketfd.settimeout(SOCKET_TIMEOUT)
except socket.error, message:
return None
return socketfd
@@ -204,25 +261,36 @@
return None
return output
+def clear_file( name ):
+ fd = open( name, 'w' )
+ fd.close()
+
### OK, main program below.
+if REPORT_FILE_PREFIX[-1] != '/':
+ REPORT_FILE_PREFIX += '/'
+
+REPORT_FILE_NAME = REPORT_FILE_PREFIX + REPORT_FILE_NAME
+COMMENTED_FILE_NAME = REPORT_FILE_PREFIX + COMMENTED_FILE_NAME
+
base_directory = BASE_DIRECTORY
remote_base_directory = REMOTE_BASE_DIRECTORY
if remote_base_directory[-1] != '/':
remote_base_directory += '/'
-remote_site_root = remote_base_directory[ : ( remote_base_directory.find( '/',
( remote_base_directory.find( '://' ) + 3 ) ) + 1 ) ]
+pos = remote_base_directory.find( '://' ) + 3
+pos = remote_base_directory.find( '/', pos) + 1
+remote_site_root = remote_base_directory[ : pos ]
# `cd` to this path
if not os.path.isdir( base_directory ):
- print 'Entered base directory isn\'t a directory (or doesn\'t exist at
all).'
+ print 'Base directory', \
+ "`" + base_directory + "'", 'not found.'
sys.exit( 1 )
-# list files
print 'Recursively listing all files in the selected directory...'
-search_directory_for_files( base_directory, '' )
+search_directory_for_files( base_directory, '')
-# find links
links_to_check = []
print 'Listing files done. Looking for links...'
for file_to_check in files_to_check:
@@ -236,33 +304,41 @@
line_number = -1
split_file_contents = file_contents.split( '\n' )
- for checked_line_number, line in enumerate( split_file_contents
):
- checked_line_number += 1 # so that line numbers don't
start from 0
+ for checked_line_number, line \
+ in enumerate( split_file_contents ):
+ checked_line_number += 1 # so that line numbers
+ # don't start from 0
if line.find( link ) != -1:
line_number = checked_line_number
break
- link_container = { 'filename': file_to_check, 'line_number':
line_number, 'link': link, 'is_link_inside_comment_block':
is_match_inside_comment_block( match ) }
+ link_container = { 'filename': file_to_check, \
+ 'line_number': line_number, \
+ 'link': link, \
+ 'is_inside_comment': \
+ is_match_inside_comment( match ) }
links_to_check.append( link_container )
-# check links
print 'Alright, I\'ve got all the links. Let\'s check them now!'
-# open report files for writing
-report_file = open( REPORT_FILE_NAME, 'w' )
+report_file = REPORT_FILE_NAME
+clear_file( report_file )
+commented_file = COMMENTED_FILE_NAME
+clear_file( commented_file )
translation_report_files = {}
number_of_links_to_check = str( len( links_to_check ) )
already_checked_links = []
for i, link_container in enumerate( links_to_check ):
if i % 10 == 0:
- print '\rChecking link ' + str( i + 1 ) + ' of ' +
number_of_links_to_check + '...',
+ print '\rChecking link ' + str( i + 1 ) + ' of ' \
+ + number_of_links_to_check + '...',
sys.stdout.flush()
filename = link_container['filename']
line_number = link_container['line_number']
link = link_container['link']
- is_link_inside_comment_block =
link_container['is_link_inside_comment_block']
+ is_inside_comment = link_container['is_inside_comment']
link_type = None
@@ -280,24 +356,22 @@
link = remote_site_root + link[1:]
else:
link_type = 'http'
-
- last_slash_pos = filename.rfind( '/' )
- subdirectory = ''
- if last_slash_pos != -1:
- subdirectory = filename[ : last_slash_pos ] +
'/'
-
- link = remote_base_directory + subdirectory + link
+ subdir = ''
+ pos = filename.rfind( '/' )
+ if pos != -1:
+ subdir = filename[: pos] + '/'
+ link = remote_base_directory + subdir + link
link_already_checked = False
- link_id_if_already_checked = -1
+ link_id = -1
for i, already_checked_link in enumerate( already_checked_links ):
if link == already_checked_link['link']:
link_already_checked = True
- link_id_if_already_checked = i
+ link_id = i
break
if link_already_checked:
- link_error =
already_checked_links[link_id_if_already_checked]['error']
+ link_error = already_checked_links[link_id]['error']
else:
link_error = None
for i in range( NUMBER_OF_ATTEMPTS ):
@@ -310,35 +384,44 @@
if link_error == None:
break
else:
- break # ignore the link, since its protocol is
unsupported
+ break # ignore the link,
+ # since its protocol is unsupported
- if DELAY_BETWEEN_RETRIES != 0:
+ if DELAY_BETWEEN_RETRIES > 0:
time.sleep( DELAY_BETWEEN_RETRIES )
- already_checked_links.append( { 'link': link, 'error':
link_error } )
+ already_checked_links.append( { 'link': link, \
+ 'error': link_error } )
+
+ # Report working links inside comments so that webmasters
+ # could uncomment them.
+ if link_error == None and is_inside_comment:
+ link_error = 'no error detected'
if link_error != None:
- if is_link_inside_comment_block:
+ if is_inside_comment:
link_error += ' (link commented out)'
+ file_to_write = commented_file
+ postfix = '/c'
+ else:
+ file_to_write = report_file
+ postfix = '/0'
- if not SKIP_TRANSLATION_FILES:
match = re.search( TRANSLATION_REGEXP, filename )
- if match and SEPARATE_TRANSLATION_REPORTS:
+ if match:
langcode = match.group( 'langcode' )
- if langcode not in translation_report_files:
- translation_report_files[langcode] =
open( REPORT_FILE_NAME + '-' + langcode, 'w' )
- translation_report_files[langcode].write(
format_error( filename, line_number, link, link_error ) )
- else:
- report_file.write( format_error( filename,
line_number, link, link_error ) )
- else:
- report_file.write( format_error( filename, line_number,
link, link_error ) )
+ file_idx = langcode + postfix
+ if file_idx not in translation_report_files:
+ translation_report_files[file_idx] = \
+ file_to_write + '-' + langcode
+ clear_file(translation_report_files[file_idx])
+ file_to_write = translation_report_files[file_idx]
+ fd = open(file_to_write, 'a')
+ fd.write(format_error(filename, line_number, \
+ link, link_error))
+ fd.close()
- if DELAY_BETWEEN_CHECKS != 0:
+ if DELAY_BETWEEN_CHECKS > 0:
time.sleep( DELAY_BETWEEN_CHECKS )
-for langcode in translation_report_files:
- translation_report_files[langcode].close()
-report_file.close()
-
print '\nDone! :-)'
-
- www/server/source/linc linc.py,
Pavel Kharitonov <=