From b9dc98d21881ea1b2bf292962233d4177bbf2018 Mon Sep 17 00:00:00 2001
From: Maxime Petazzoni
Date: Tue, 11 May 2010 17:10:51 +0200
Subject: [PATCH ocitysmap] German prefix rules

Signed-off-by: Maxime Petazzoni
---
 ocitysmap/i18n.py |   76 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 71 insertions(+), 5 deletions(-)

diff --git a/ocitysmap/i18n.py b/ocitysmap/i18n.py
index 37563cf..a9c1344 100644
--- a/ocitysmap/i18n.py
+++ b/ocitysmap/i18n.py
@@ -640,6 +640,72 @@ class i18n_pl_generic(i18n):
         return a == b
 
 
+class i18n_de_generic(i18n):
+    #
+    # German streets are often named after people and include a title.
+    # The title is captured as part of the <prefix> group, which also
+    # covers airport names and "New"/"Old" ("Neue"/"Alte") prefixes.
+    #
+    APPELLATIONS = [ u"Alte", u"Alter", u"Doktor", u"Dr.",
+                     u"Flughafen", u"Flugplatz", u"Gen.", u"General",
+                     u"Neue", u"Neuer", u"Platz",
+                     u"Prinz", u"Prinzessin", u"Prof.",
+                     u"Professor" ]
+    #
+    # Determinants (prepositions/articles) that may follow an appellation
+    # are captured as part of the prefix as well,
+    # e.g. "Platz der deutschen Einheit" => "deutschen Einheit (Platz der)"
+    # NOTE(review): PREFIX_REGEXP requires a leading appellation, so a bare
+    # "An der Märchenwiese" is left unchanged -- confirm this is intended.
+    #
+    DETERMINANTS = [ ur"\s?An den", ur"\s?An der", ur"\s?Am",
+                     ur"\s?Auf den", ur"\s?Auf der",
+                     u" an", u" des", u" der", u" von", u" vor" ]
+
+    SPACE_REDUCE = re.compile(r"\s+")
+    PREFIX_REGEXP = re.compile(r"^(?P<prefix>(%s)(%s)?)\s?\b(?P<name>.+)" %
+                                    ("|".join(APPELLATIONS),
+                                     "|".join(DETERMINANTS)), re.IGNORECASE
+                                                                 | re.UNICODE)
+
+    # for IndexPageGenerator._upper_unaccent_string
+    E_ACCENT = re.compile(ur"[éèêëẽ]", re.IGNORECASE | re.UNICODE)
+    I_ACCENT = re.compile(ur"[íìîïĩ]", re.IGNORECASE | re.UNICODE)
+    A_ACCENT = re.compile(ur"[áàâäã]", re.IGNORECASE | re.UNICODE)
+    O_ACCENT = re.compile(ur"[óòôöõ]", re.IGNORECASE | re.UNICODE)
+    U_ACCENT = re.compile(ur"[úùûüũ]", re.IGNORECASE | re.UNICODE)
+
+    def __init__(self, language, locale_path):
+        self.language = str(language)
+        _install_language(language, locale_path)
+
+    def _upper_unaccent_string(self, s):
+        s = self.E_ACCENT.sub("e", s)
+        s = self.I_ACCENT.sub("i", s)
+        s = self.A_ACCENT.sub("a", s)
+        s = self.O_ACCENT.sub("o", s)
+        s = self.U_ACCENT.sub("u", s)
+        return s.upper()
+
+    def language_code(self):
+        return self.language
+
+    def user_readable_street(self, name):
+        #
+        # Return empty (or missing) names untouched; otherwise normalise
+        # whitespace and move a leading prefix behind the street name.
+        #
+        if not name:
+            return name
+
+        name = name.strip()
+        name = self.SPACE_REDUCE.sub(" ", name)
+        name = self.PREFIX_REGEXP.sub(r"\g<name> (\g<prefix>)", name)
+        return name
+
+    def first_letter_equal(self, a, b):
+        return self._upper_unaccent_string(a) == self._upper_unaccent_string(b)
+
 class i18n_generic(i18n):
     def __init__(self, language, locale_path):
         self.language = str(language)
@@ -678,15 +744,15 @@ language_class_map = {
     'en_US.UTF-8': i18n_generic,
     'en_ZA.UTF-8': i18n_generic,
     'en_ZW.UTF-8': i18n_generic,
-    'de_BE.UTF-8': i18n_generic,
     'nl_BE.UTF-8': i18n_nl_generic,
     'nl_NL.UTF-8': i18n_nl_generic,
     'it_IT.UTF-8': i18n_it_generic,
     'it_CH.UTF-8': i18n_it_generic,
-    'de_AT.UTF-8': i18n_generic,
-    'de_DE.UTF-8': i18n_generic,
-    'de_LU.UTF-8': i18n_generic,
-    'de_CH.UTF-8': i18n_generic,
+    'de_AT.UTF-8': i18n_de_generic,
+    'de_BE.UTF-8': i18n_de_generic,
+    'de_DE.UTF-8': i18n_de_generic,
+    'de_LU.UTF-8': i18n_de_generic,
+    'de_CH.UTF-8': i18n_de_generic,
     'es_ES.UTF-8': i18n_es_generic,
     'es_AR.UTF-8': i18n_es_generic,
     'es_BO.UTF-8': i18n_es_generic,
-- 
1.6.3.3.341.g9b22d