[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
www/server/source/sitemap-generator sitemap-gen...
From: |
Pavel Kharitonov |
Subject: |
www/server/source/sitemap-generator sitemap-gen... |
Date: |
Thu, 07 May 2015 15:12:57 +0000 |
CVSROOT: /web/www
Module name: www
Changes by: Pavel Kharitonov <ineiev> 15/05/07 15:12:57
Modified files:
server/source/sitemap-generator: sitemap-generator.py
Log message:
Generate sitemaps for crawlers RT #1009886.
CVSWeb URLs:
http://web.cvs.savannah.gnu.org/viewcvs/www/server/source/sitemap-generator/sitemap-generator.py?cvsroot=www&r1=1.12&r2=1.13
Patches:
Index: sitemap-generator.py
===================================================================
RCS file: /web/www/www/server/source/sitemap-generator/sitemap-generator.py,v
retrieving revision 1.12
retrieving revision 1.13
diff -u -b -r1.12 -r1.13
--- sitemap-generator.py 15 Jul 2014 11:55:53 -0000 1.12
+++ sitemap-generator.py 7 May 2015 15:12:55 -0000 1.13
@@ -40,6 +40,30 @@
# The expression for names of localized sitemap versions.
SITEMAP_REGEXP = 'sitemap\.' + LANGCODE_REGEXP + '\.html'
TOP_DIRECTORY = ''
+
+SITEMAP_MAX_URLS = 50000     # URL cap per sitemap file (presumably the sitemaps.org limit)
+SITEMAP_MAX_LEN = 10485760   # size cap per sitemap file: 10 * 1024 * 1024
+
+URL_ROOT = 'http://www.gnu.org/'   # prefix of every page URL written to a sitemap
+SITEMAP_BASE = 'sitemap'           # stem of all output file names
+SITEMAP_IDX = ''                   # goes between stem and extension of the index file
+SITEMAP_EXT = '.xml'
+SITEMAP_COMPRESSION = '.gz'        # appended to the sitemap URLs listed in the index
+SITEMAP_URL = URL_ROOT             # URL prefix of the sitemap files listed in the index
+# Both namespace declarations must sit inside the single <urlset> start tag.
+SITEMAP_ORG_HEADER = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+ xmlns:xhtml="http://www.w3.org/1999/xhtml">
+"""
+SITEMAP_ORG_FOOTER = '</urlset>\n'
+SITEMAP_ORG_BOILERPLATE_LEN = len (SITEMAP_ORG_HEADER) \
+    + len (SITEMAP_ORG_FOOTER)
+SITEMAP_IDX_HEADER = \
+"""<?xml version="1.0" encoding="UTF-8"?>
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+"""
+SITEMAP_IDX_FOOTER = '</sitemapindex>\n'
+
SITEMAP_DIR = 'server'
TRANSLATION_REGEXP = '\.(?P<langcode>[a-z]{2}|[a-z]{2}-[a-z]{2})' \
+ FILENAMES_TO_LIST_REGEXP
@@ -60,6 +84,7 @@
excluded_dirs = None
excluded_files = None
output_text = ''
+sitemap_text = ''
translist = ''
translation_linguas = []
title_tails = None
@@ -416,17 +441,80 @@
html.close()
return ''
+sitemap_urls = 0  # number of URLs in the sitemap text accumulated so far
+sitemap_no = 0  # number used in the name of the next sitemap file written
+
+def url_entry (base, languages):
+    """Build the sitemap <url> elements for one page.
+
+    languages holds file-name suffixes ('' or '.<lang>'); a list of exactly
+    two collapses to [''].  Returns [xml_text, number_of_urls].
+    """
+    if len(languages) == 2:
+        languages = ['']
+    alternates = ''   # identical for every <url> of the page, so built once
+    if len(languages) > 1:
+        for suffix in languages:
+            code = suffix[1:]          # drop the leading dot
+            if suffix == '':
+                code = 'x-default'
+            alternates += ' <xhtml:link rel="alternate" hreflang="' + code \
+                          + '"\n href="' + URL_ROOT + base + suffix + '.html" />\n'
+    entry = ''
+    for suffix in languages:
+        entry += '<url>\n'
+        entry += ' <loc>' + URL_ROOT + base + suffix + '.html</loc>\n'
+        entry += alternates
+        entry += '</url>\n'
+    return [ entry, len(languages) ]
+
+def append_sitemap_org (directory, base, languages):
+    """Queue the sitemap entry for one page, flushing a full sitemap first.
+
+    If adding the entry would reach SITEMAP_MAX_LEN or SITEMAP_MAX_URLS,
+    the buffered text is written to SITEMAP_BASE + str(sitemap_no)
+    + SITEMAP_EXT and the buffer restarts with the new entry.
+    """
+    global sitemap_urls, sitemap_no, sitemap_text
+    if len(directory):
+        directory += '/'
+    [ entry, local_urls ] = url_entry (directory + base, languages)
+    too_long = len (sitemap_text) + len (entry) \
+               + SITEMAP_ORG_BOILERPLATE_LEN >= SITEMAP_MAX_LEN
+    # The diagnostic below reuses the split test so it names the real cause.
+    too_many = sitemap_urls + local_urls >= SITEMAP_MAX_URLS
+    if not (too_long or too_many):
+        sitemap_text += entry
+        sitemap_urls += local_urls
+        return
+    sitemap_text = SITEMAP_ORG_HEADER + sitemap_text + SITEMAP_ORG_FOOTER
+    print 'writing next sitemap (' + str(sitemap_no) + '): ' \
+          + str(len(sitemap_text)) + ' bytes, ' + str(sitemap_urls) + ' urls'
+    if too_long:
+        print ' Maximum length (' + str(SITEMAP_MAX_LEN) + ') reached'
+    if too_many:
+        print ' Maximum URL number (' + str(SITEMAP_MAX_URLS) + ') reached'
+    out_file = open(SITEMAP_BASE + str(sitemap_no) + SITEMAP_EXT, 'w')
+    try:        # close the file even if encoding or writing fails
+        out_file.write(sitemap_text.encode('utf-8'))
+    finally:
+        out_file.close()
+    sitemap_no += 1
+    sitemap_urls, sitemap_text = local_urls, entry
+
def append_translist (directory, files, base, titles):
global translist
global translation_linguas
item = ''
langs = ''
+ languages = ['', '.en']
for lang in SITE_LINGUAS:
trans = base + '.' + lang + '.html'
if not trans in files:
continue
if not lang in translation_linguas:
translation_linguas.append(lang)
+ languages.append('.' + lang)
emph_open = ''
emph_close = ''
emph = get_outdated_tag (directory, base, lang)
@@ -445,6 +533,7 @@
+ 'hreflang="' + lang + '" lang="' + lang + '" xml:lang="' \
+ lang + '" href="/' + path + '">\n' + name + '</a>' \
+ emph_close + '<br /><!--#endif -->'
+ append_sitemap_org(directory, base, languages)
if len(langs) == 0:
return
translist = translist + '<!--#if expr="$qs = /,(' \
@@ -492,12 +581,11 @@
if depth_level != 0:
index_file = get_index_filename(directory) \
if LINK_TO_INDEX_FILES else None
- msgid = '<a' + title_class + ' href="/' \
- + directory + '/'
-
+ loc = directory + '/'
if index_file:
- msgid = msgid + index_file
- msgid = msgid + '">' + directory + '</a>'
+ loc = loc + index_file
+ msgid = '<a' + title_class + ' href="/' \
+ + loc + '">' + directory + '</a>'
write('\n<dl><dt>' + msgid + '</dt>\n <dd>')
append_sitemap_pos(msgid)
if index_file:
@@ -650,6 +738,33 @@
output_translations(OUTPUT_FILE_NAME)
+# Write whatever is still buffered as the final numbered sitemap.
+if len(sitemap_text):
+    # Wrap first so the size printed matches the file, as for earlier sitemaps.
+    sitemap_text = SITEMAP_ORG_HEADER + sitemap_text + SITEMAP_ORG_FOOTER
+    print 'writing last sitemap (' + str(sitemap_no) + '): ' \
+          + str(len(sitemap_text)) + ' bytes, ' + str(sitemap_urls) + ' urls'
+    output_file = open(SITEMAP_BASE + str(sitemap_no) + SITEMAP_EXT, 'w')
+    try:
+        output_file.write(sitemap_text.encode('utf-8'))
+    finally:
+        output_file.close()
+    sitemap_no += 1
+
+# NOTE(review): index URLs carry SITEMAP_COMPRESSION; files are written uncompressed.
+if sitemap_no > 0:
+    print 'writing sitemap index'
+    sitemap_text = SITEMAP_IDX_HEADER
+    for i in range (sitemap_no):
+        sitemap_text += '<sitemap>\n <loc>' + SITEMAP_URL + SITEMAP_BASE + str(i) \
+                        + SITEMAP_EXT + SITEMAP_COMPRESSION + '</loc>\n</sitemap>\n'
+    sitemap_text += SITEMAP_IDX_FOOTER
+    output_file = open(SITEMAP_BASE + SITEMAP_IDX + SITEMAP_EXT, 'w')
+    try:
+        output_file.write(sitemap_text.encode('utf-8'))
+    finally:
+        output_file.close()
+
if len(translist):
linguas = ''
for l in translation_linguas:
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- www/server/source/sitemap-generator sitemap-gen...,
Pavel Kharitonov <=