From b9dc98d21881ea1b2bf292962233d4177bbf2018 Mon Sep 17 00:00:00 2001
From: Maxime Petazzoni <address@hidden>
Date: Tue, 11 May 2010 17:10:51 +0200
Subject: [PATCH ocitysmap] German prefix rules

Signed-off-by: Maxime Petazzoni <address@hidden>
---
 ocitysmap/i18n.py |   76 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 71 insertions(+), 5 deletions(-)
diff --git a/ocitysmap/i18n.py b/ocitysmap/i18n.py
index 37563cf..a9c1344 100644
--- a/ocitysmap/i18n.py
+++ b/ocitysmap/i18n.py
@@ -640,6 +640,72 @@ class i18n_pl_generic(i18n):
         return a == b
 
 
+class i18n_de_generic(i18n):
+    """German rules: list a street's leading title/article after its name."""
+
+    # Leading words moved behind the name: titles of people, "Alte"/"Neue",
+    # airport words and "Platz". Literal text, escaped for PREFIX_REGEXP.
+    APPELLATIONS = [ u"Alte", u"Alter", u"Doktor", u"Dr.",
+                     u"Flughafen", u"Flugplatz", u"Gen.", u"General",
+                     u"Neue", u"Neuer", u"Platz",
+                     u"Prinz", u"Prinzessin", u"Prof.",
+                     u"Professor" ]
+
+    # Regexp fragments for articles/prepositions that may follow an
+    # appellation and are captured with it ("An der" must precede " an"),
+    # e.g. "Platz der deutschen Einheit" => "deutschen Einheit (Platz der)".
+    # NOTE(review): PREFIX_REGEXP requires an appellation first, so a bare
+    # "An der Märchenwiese" is left unchanged -- confirm this is intended.
+    DETERMINANTS = [ u"\\s?An den", u"\\s?An der", u"\\s?Am",
+                     u"\\s?Auf den", u"\\s?Auf der",
+                     u" an", u" des", u" der", u" von", u" vor" ]
+
+    SPACE_REDUCE = re.compile(r"\s+")
+    PREFIX_REGEXP = re.compile(r"^(?P<prefix>(%s)(%s)?)\s?\b(?P<name>.+)" %
+                                    ("|".join(map(re.escape, APPELLATIONS)),
+                                     "|".join(DETERMINANTS)), re.IGNORECASE
+                                                                 | re.UNICODE)
+
+    # Accented vowels that _upper_unaccent_string replaces by the base letter.
+    E_ACCENT = re.compile(u"[éèêëẽ]", re.IGNORECASE | re.UNICODE)
+    I_ACCENT = re.compile(u"[íìîïĩ]", re.IGNORECASE | re.UNICODE)
+    A_ACCENT = re.compile(u"[áàâäã]", re.IGNORECASE | re.UNICODE)
+    O_ACCENT = re.compile(u"[óòôöõ]", re.IGNORECASE | re.UNICODE)
+    U_ACCENT = re.compile(u"[úùûüũ]", re.IGNORECASE | re.UNICODE)
+
+    def __init__(self, language, locale_path):
+        """Store str(language) and call _install_language for it."""
+        self.language = str(language)
+        _install_language(language, locale_path)
+
+    def _upper_unaccent_string(self, s):
+        """Return s upper-cased, with the accented vowels above unaccented."""
+        s = self.E_ACCENT.sub("e", s)
+        s = self.I_ACCENT.sub("i", s)
+        s = self.A_ACCENT.sub("a", s)
+        s = self.O_ACCENT.sub("o", s)
+        s = self.U_ACCENT.sub("u", s)
+        return s.upper()
+
+    def language_code(self):
+        """Return the language code stored by __init__."""
+        return self.language
+
+    def user_readable_street(self, name):
+        """Return name rewritten as "<name> (<prefix>)" if it has a prefix.
+
+        Whitespace is trimmed and collapsed first; empty names pass through.
+        """
+        if not name:
+            return name
+
+        name = self.SPACE_REDUCE.sub(" ", name.strip())
+        return self.PREFIX_REGEXP.sub(r"\g<name> (\g<prefix>)", name)
+
+    def first_letter_equal(self, a, b):
+        """Return True if a and b are equal ignoring case and vowel accents."""
+        return self._upper_unaccent_string(a) == self._upper_unaccent_string(b)
+
 class i18n_generic(i18n):
     def __init__(self, language, locale_path):
         self.language = str(language)
@@ -678,15 +744,15 @@ language_class_map = {
     'en_US.UTF-8': i18n_generic,
     'en_ZA.UTF-8': i18n_generic,
     'en_ZW.UTF-8': i18n_generic,
-    'de_BE.UTF-8': i18n_generic,
     'nl_BE.UTF-8': i18n_nl_generic,
     'nl_NL.UTF-8': i18n_nl_generic,
     'it_IT.UTF-8': i18n_it_generic,
     'it_CH.UTF-8': i18n_it_generic,
-    'de_AT.UTF-8': i18n_generic,
-    'de_DE.UTF-8': i18n_generic,
-    'de_LU.UTF-8': i18n_generic,
-    'de_CH.UTF-8': i18n_generic,
+    'de_AT.UTF-8': i18n_de_generic,
+    'de_BE.UTF-8': i18n_de_generic,
+    'de_DE.UTF-8': i18n_de_generic,
+    'de_LU.UTF-8': i18n_de_generic,
+    'de_CH.UTF-8': i18n_de_generic,
     'es_ES.UTF-8': i18n_es_generic,
     'es_AR.UTF-8': i18n_es_generic,
     'es_BO.UTF-8': i18n_es_generic,
-- 
1.6.3.3.341.g9b22d