www/server/source/linc linc.py

www-commits

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

www/server/source/linc linc.py

From:	Pavel Kharitonov
Subject:	www/server/source/linc linc.py
Date:	Fri, 01 Feb 2013 14:26:52 +0000

CVSROOT:        /web/www
Module name:    www
Changes by:     Pavel Kharitonov <ineiev>       13/02/01 14:26:52

Modified files:
        server/source/linc: linc.py 

Log message:
        Put report files in the current directory by default.
        Use a BASE_DIRECTORY relative to the location of linc.py by default.
        New option: `--local' (don't download files, assume no error).
        Don't exclude the `wwwes' directory by default (it no longer
        exists).
        Add more diagnostics when VERBOSE > 1.

CVSWeb URLs:
http://web.cvs.savannah.gnu.org/viewcvs/www/server/source/linc/linc.py?cvsroot=www&r1=1.7&r2=1.8

Patches:
Index: linc.py
===================================================================
RCS file: /web/www/www/server/source/linc/linc.py,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -b -r1.7 -r1.8
--- linc.py     14 Jan 2013 16:27:53 -0000      1.7
+++ linc.py     1 Feb 2013 14:26:48 -0000       1.8
@@ -20,7 +20,7 @@
 
 # defines
 
-LINC_VERSION = 'LINC 0.3'
+LINC_VERSION = 'LINC 0.4'
 COPYRIGHT= \
 'Copyright (C) 2011-2012 Waclaw Jacek\n\
 Copyright (C) 2013 Free Software Foundation, Inc.\n\
@@ -30,7 +30,7 @@
 \n\
 Written by Waclaw Jacek.'
 
-BASE_DIRECTORY = '/home/g/gnun/checkouts/www/'
+BASE_DIRECTORY = ''
 REMOTE_BASE_DIRECTORY = 'http://www.gnu.org/'
 
 # End every header with "\r\n"
@@ -59,7 +59,7 @@
 # linc run takes many hours (almost a day), and the old results
 # wouldn't be available during that period if new files went
 # directly to the destination directory.
-REPORT_FILE_PREFIX = 'reports-temp'
+REPORT_FILE_PREFIX = './'
 
 # File to which the errors will be reported.
 REPORT_FILE_NAME = 'broken_links'
@@ -73,11 +73,14 @@
 # After what time to give up with trying to retrieve a website.
 SOCKET_TIMEOUT = 20
 
+# Don't download the files, assume no error.
+LOCAL = False
+
 # regexp-related defines
 
 # Matching directories will not be entered to check their
 # files or subdirectories.
-EXCLUDED_DIRECTORIES_REGEXP = '^(japan|wwwes|wwwin|education/fr|\
+EXCLUDED_DIRECTORIES_REGEXP = '^(japan|wwwin|education/fr|\
 education/draft|press|server/staging|software/[^/]+)$|(^|/)po$'
 EXCLUDED_FILENAMES_REGEXP = \
   '^server/standards/boilerplate\.html|server/.*whatsnew\.html$'
@@ -92,7 +95,7 @@
 HTTP_LINK_REGEXP = \
   'http://(?P<hostname>[^/:]+)(:(?P<port>[0-9]+))?(?P<resource>/[^#]*)?'
 HTTP_NEW_LOCATION_HEADER = '^Location: (?P<new_location>.+)$'
-LINK_REGEXP = '<a( .+?)? href="(?P<link>[^mailto:].+?)"( .+?)?>'
+LINK_REGEXP = '<a( .+?)? href="(?P<link>[^m].+?)"( .+?)?>'
 TRANSLATION_REGEXP = '\.(?P<langcode>[a-z]{2}|[a-z]{2}-[a-z]{2})\.[^.]+$'
 
 VERBOSE = 0
@@ -178,6 +181,11 @@
        
        end_of_headers_pos = webpage.find( '\r\n\r\n' )
        if end_of_headers_pos == -1:
+               if VERBOSE > 1:
+                       print 'No end of headers found on webpage'
+                       print '- - - - -'
+                       print webpage
+                       print '- - - - -'
                return 'couldn\'t find end of ' \
                        + 'headers (possibly no content in file)'
                
@@ -309,6 +317,9 @@
                  metavar = 'N',
                   help = 'maximum number of forwards to follow [' \
                         + str(FORWARDS_TO_FOLLOW) + ']')
+parser.add_option('-l', '--local', dest = 'local', action = 'store_true',
+                 default = False,
+                 help = "don't download files, assume no error")
 parser.add_option('-o', '--output', dest = 'dir_name', metavar = 'DIRECTORY',
                  help = 'write reports to DIRECTORY [' \
                         + REPORT_FILE_PREFIX + ']')
@@ -350,6 +361,12 @@
 
 if len(args) != 0:
        BASE_DIRECTORY = args[0]
+else: 
+       prog_dir = sys.argv[0]
+       pos = prog_dir.rfind('/')
+       prog_dir = prog_dir[ : pos] if (pos != -1) else './'
+       # This script's place is /server/source/linc
+       BASE_DIRECTORY = os.path.abspath(os.path.join(prog_dir, '../../..'))
 
 if options.quiet != None:
        VERBOSE -= options.quiet
@@ -376,6 +393,8 @@
        EXCLUDED_FILENAMES_REGEXP = options.exclude
 if options.exclude_dir != None:
        EXCLUDED_DIRECTORIES_REGEXP = options.exclude_dir
+if options.local != None:
+       LOCAL = options.local
 
 base_directory = BASE_DIRECTORY
 remote_base_directory = REMOTE_BASE_DIRECTORY
@@ -404,6 +423,7 @@
        print "Base URL:             `" + REMOTE_BASE_DIRECTORY + "'"
        print "Excluded files:       `" + EXCLUDED_FILENAMES_REGEXP + "'"
        print "Excluded directories: `" + EXCLUDED_DIRECTORIES_REGEXP + "'"
+       print "Run locally:          `" + ('yes' if LOCAL else 'no') + "'"
 
 # `cd` to this path
 if not os.path.isdir( base_directory ):
@@ -500,7 +520,9 @@
                if already_checked_links[link_id]['error'] == None:
                        continue
                checked_link = already_checked_links[link_id]
-       if link_type == 'ftp':
+       if LOCAL:
+               link_error = None
+       elif link_type == 'ftp':
                link_error = get_ftp_link_error( link )
        elif link_type == 'http':
                link_error = get_http_link_error( link )
@@ -523,6 +545,13 @@
                        broken_so_far = broken_so_far + 1
        print '\n' + str(len(already_checked_links)) + ' unique links, ' \
              + str(broken_so_far) + ' seem broken'
+       if VERBOSE > 1:
+               for i, checked_link in enumerate(already_checked_links):
+                       if checked_link['error'] != None or VERBOSE > 2:
+                               print 'link ' + str(i) + ': ' \
+                               + checked_link['link'] + ': ' \
+                               + (checked_link['error'] \
+                                    if checked_link['error'] else '')
        if broken_so_far == 0:
                print 'No more broken links; skipping the rest passes (if any)'
                break

[Prev in Thread]

Current Thread

[Next in Thread]

www/server/source/linc linc.py, Pavel Kharitonov <=
- www/server/source/linc linc.py, Pavel Kharitonov, 2013/02/07
- www/server/source/linc linc.py, Pavel Kharitonov, 2013/02/14
- www/server/source/linc linc.py, Pavel Kharitonov, 2013/02/18
- www/server/source/linc linc.py, Pavel Kharitonov, 2013/02/18

Prev by Date: www people/po/people.translist server/banner.fi...
Next by Date: www/server/source/sitemap-generator sitemap-gen...
Previous by thread: www people/po/people.translist server/banner.fi...
Next by thread: www/server/source/linc linc.py
Index(es):
- Date
- Thread