Source code for akvo.codelists.scripts.iati_codelist_generator

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Akvo RSR is covered by the GNU Affero General Public License. See more
# details in the license.txt file located at the root folder of the Akvo RSR
# module. For additional details on the GNU license please see
# <http://www.gnu.org/licenses/agpl.html>.

"""
Generate a python file, codelists_vXXX.py, in the codelists folder that contains a python representation of all
IATI codelists, based on the IATI version.
See http://iatistandard.org/codelists/ and http://iatistandard.org/codelists/code-list-api/
"""


import argparse
import csv
import json
import re
import sys
import tempfile
from os.path import abspath, dirname, join
from xml.etree import ElementTree

import requests

HERE = dirname(abspath(__file__))

# Supported IATI versions, mapped to the base URL that the codelist index and
# the individual codelist XML files are fetched from (see get_codelists).
# Add an entry to this dict to support a new version.
# NOTE(review): "1.01" points at the same archive host as "1.02" - presumably
# because there is no separate 1.01 archive; confirm before relying on it.
VERSIONS = {
    "1.01": "http://codelists102.archive.iatistandard.org/data/",
    "1.02": "http://codelists102.archive.iatistandard.org/data/",
    "1.03": "http://codelists103.archive.iatistandard.org/data/",
    "1.04": "http://iatistandard.org/104/codelists/downloads/clv2/",
    "1.05": "http://iatistandard.org/105/codelists/downloads/clv2/",
    "2.01": "http://iatistandard.org/201/codelists/downloads/clv2/",
    "2.02": "http://iatistandard.org/202/codelists/downloads/clv2/",
    "2.03": "http://iatistandard.org/203/codelists/downloads/clv2/",
}

# Preferred left-to-right order of the codelist columns in the generated file.
# data_to_strings sorts each codelist's fields by their position in this tuple;
# fields that are not listed here are placed after the listed ones.
FIELDS_ORDER = ("category", "code", "name", "description", "url")

# Codelists whose values are marked for translation, keyed by the IATI codelist
# name; each value lists the fields of that codelist that are translatable.
# data_to_strings wraps non-empty values of these fields in _(u"...") and
# get_translation_pairs downloads the translations for exactly these entries.
# Commented-out entries are currently not translated; the trailing remarks
# record why some of them were left out.
TRANSLATED_CODELISTS = {
    # 'AidType': [u"name"], # Very long descriptions!
    # 'ActivityScope': [u"name", u"description"],
    'ActivityStatus': ["name", "description"],
    # 'BudgetIdentifier': [u"name"],
    # 'BudgetIdentifierVocabulary': [u"name", u"description"],
    # 'BudgetStatus': [u"name", u"description"],
    # 'BudgetType': [u"name", u"description"],
    # one very long name, probably a data bug 'CollaborationType': [u"name", u"description"],
    # 'ConditionType': [u"name", u"description"],
    'ContactType': ["name", "description"],
    # 'CRSAddOtherFlags': [u"name", u"description"],
    # 'Currency': [u"name"],
    # 'Country': [u"name"],
    # 'DisbursementChannel': [u"name", u"description"],
    # 'FinanceType': [u"name"], # Very long descriptions!
    # 'DocumentCategory': [u"name", u"description"],
    # 'FlowType': [u"name", u"description"],
    # 'GeographicLocationClass': [u"name", u"description"],
    # 'GeographicLocationReach': [u"name", u"description"],
    # 'GeographicVocabulary': [u"name", u"description"],
    # 'HumanitarianScopeType': [u"name"],
    # 'HumanitarianScopeVocabulary': [u"name"],
    'IndicatorMeasure': ["name", "description"],
    # 'IndicatorVocabulary': [u"name"],
    # 'Language': [u"name"],
    # 'LoanRepaymentPeriod': [u"name", u"description"],
    # 'LoanRepaymentType': [u"name", u"description"],
    # 'LocationType': [u"name"],
    # 'OrganisationType': [u"name"],
    # 'PolicyMarker': [u"name", u"description"],
    # 'PolicySignificance': [u"name"],
    # 'Region': [u"name"],
    # 'RegionVocabulary': [u"name", u"description"],
    'RelatedActivityType': ["name", "description"],
    'ResultType': ["name", "description"],
    # 'Sector': [u"name"], # very long descriptions
    # 'SectorCategory': [u"name"], # very long descriptions
    'SectorVocabulary': ["name", "description"],
    # 'TiedStatus': [u"name", u"description"],
    # 'TransactionType': [u"name", u"description"],
}

# Codelists that are additionally exported as JSON option files by
# write_codelist_json, keyed by the IATI codelist name. Per-codelist settings:
#   'path'        - output file, relative to JSON_CODELISTS_PATH_PREFIX
#   'prefix-code' - when True (the default) labels read "<code> - <name>",
#                   otherwise just "<name>"
#   'indent'      - passed to json.dump as indent (default None)
#   'separators'  - passed to json.dump as separators (default (',', ':'))
#   'add-empty'   - when True an empty "None" option is inserted first
#                   (default False; no entry below sets it)
# The "Section N" comments group the entries by the section number that
# appears in their output path.
JSON_CODELISTS = {
    # Section 1
    'AidType': {'path': 'section1/options/aid-types.json'},
    'AidTypeVocabulary': {'path': 'section1/options/aid-type-vocabulary.json'},
    'FinanceType': {'path': 'section1/options/finance-types.json'},
    'FlowType': {'path': 'section1/options/flow-types.json'},
    'TiedStatus': {'path': 'section1/options/tied-statuses.json', 'prefix-code': False},
    # Section 6
    'BudgetIdentifier': {'path': 'section6/country-budget-items/options.json'},
    'DisbursementChannel': {'path': 'section6/transactions/options/channels.json'},
    'TransactionType': {'path': 'section6/transactions/options/type-options.json', 'prefix-code': False},
    # Section 7
    'ActivityScope': {'path': 'section7/scope-options.json'},
    'GeographicVocabulary': {'path': 'section7/location-items/admin-vocab-options.json'},
    'LocationType': {'path': 'section7/location-items/feature-options.json'},
    'Region': {'path': 'section7/recipient-regions/regions.json'},
    # Section 8
    'Sector': {'path': 'section8/vocab-1-codes.json', 'indent': 2, 'separators': (',', ': '), 'prefix-code': False},
    'SectorCategory': {'path': 'section8/vocab-2-codes.json', 'prefix-code': False},
    'SectorVocabulary': {'path': 'section8/vocab.json'},
    'PolicySignificance': {'path': 'section8/policy-markers/significances.json'},
    'PolicyMarker': {'path': 'section8/policy-markers/markers.json', 'prefix-code': False},
    # Section 9
    'DocumentCategory': {'path': 'section9/docs/categories.json'},
    'FileFormat': {'path': 'section9/docs/formats.json'},
    'Language': {'path': 'section9/docs/languages.json', 'prefix-code': False},
    # Section 11
    'CRSChannelCode': {'path': 'section11/channel-codes.json'},
}

# Directory the JSON files above are written into. It is a relative path, so it
# is resolved against the current working directory of the running script.
JSON_CODELISTS_PATH_PREFIX = 'akvo/rsr/spa/app/modules/editor/'

# Skeleton of a complete generated module (encoding header, gettext import and
# the codelists themselves).
# NOTE(review): nothing in the visible part of this file references
# DOC_TEMPLATE; the __main__ block writes the header lines by hand instead.
DOC_TEMPLATE = """# -*- coding: utf-8 -*-

from django.utils.translation import gettext_lazy as _

{codelists}
"""

# Source text of a single codelist: a tuple whose first element is the tuple of
# field names, followed by one tuple per codelist row. Filled in by
# data_to_strings.
CODELIST_TEMPLATE = """
# From {url}
{name} = (
    {field_names}
{rows}
)"""

# Source text of one value: a plain string literal ...
STRING_BIT = '"{}"'
# ... or a string literal marked for translation through the `_` alias.
I18N_BIT = '_(u"{}")'


def pythonify_codelist_name(codelist_name):
    """Turn a CamelCase codelist name into an UPPER_SNAKE_CASE identifier.

    For example ``OrganisationType`` becomes ``ORGANISATION_TYPE``.

    The name is split in front of every upper-case letter, so consecutive
    capitals are separated as well (``CRSAddOtherFlags`` becomes
    ``C_R_S_ADD_OTHER_FLAGS``). Characters before the first upper-case letter
    are not part of any chunk and are therefore dropped. Hyphens are replaced
    by underscores.

    :param codelist_name: codelist name as published by IATI
    :return: the identifier used for the codelist in the generated module
    """
    # Every chunk starts at an upper-case letter and runs up to the next one.
    words = re.findall(r'[A-Z][^A-Z]*', codelist_name)
    return '_'.join(words).upper().replace('-', '_')
def prettify_country_name(country):
    """Convert an ALL CAPS country name into readable capitalisation.

    The name is lower-cased and split on single spaces; every word is then
    capitalised, except for the small words "the", "and", "of" and "da", which
    stay lower-case unless they are the first word or follow a comma.
    Additional rules: "U.S." keeps both capitals, the first letter after an
    opening parenthesis is capitalised and every part of a hyphenated word is
    capitalised.

    Empty input, repeated spaces and names starting with a small word are
    handled without raising.

    :param country: country name, typically in upper case
    :return: the prettified name
    """
    small_words = ('the', 'and', 'of', 'da')
    bits = []
    previous = ''
    for bit in country.lower().split(' '):
        # `not bits` covers the first word; endswith() is safe on the empty
        # `previous` where indexing with [-1] would raise IndexError.
        if bit not in small_words or not bits or previous.endswith(','):
            bit = bit.capitalize()
        # Special case for U.S., which capitalize() turns into "U.s."
        if bit == 'U.s.':
            bit = 'U.S.'
        # Capitalise inside parentheses; startswith() tolerates empty words
        # produced by consecutive spaces.
        if bit.startswith('('):
            bit = "({}".format(bit[1:].capitalize())
        # Fix hyphenated names
        if '-' in bit:
            bit = '-'.join([part.capitalize() for part in bit.split('-')])
        bits.append(bit)
        previous = bit
    return ' '.join(bits)
def codelist_data(result, version, transform=None):
    """Parse one downloaded codelist XML document into a plain data structure.

    The returned dict has the following format::

        {
            'fields': {'<field_name_1>', '<field_name_2>', ...},
            'rows': [
                {'<field_name_1>': '<value_1>', '<field_name_2>': '<value_2>', ...},
                {...},
            ],
        }

    'fields' is the union of the field names found in *all* items, because not
    every item carries every field.

    :param result: response-like object whose ``text`` attribute holds the XML
    :param version: IATI version string; "1.01"-"1.03" documents have the
        items directly below the root element, later versions below
        ``codelist-items/codelist-item``
    :param transform: optional dict ``{'field': <name>, 'func': <callable>}``;
        the callable is applied to the text of the named field
    :return: dict with a 'fields' set and a 'rows' list as described above
    """
    tree = ElementTree.fromstring(result.text.encode('utf-8'))
    if version in ["1.01", "1.02", "1.03"]:
        items = tree
    else:
        items = tree.find('codelist-items').findall('codelist-item')

    rows = []
    # Collected across all items; initialising this inside the loop would keep
    # only the fields of the last item and leave it undefined for an empty list.
    fields = set()
    for item in items:
        row = {}
        for field in item:
            # an attrib here indicates an alternative language, which we skip for now
            if field.attrib:
                continue
            fields.add(field.tag)
            text = field.text.replace('\n', '').replace('\r', '') if field.text else ''
            if transform and transform['field'] == field.tag:
                text = transform['func'](text)
            row[field.tag] = text
        rows.append(row)

    return {'fields': fields, 'rows': rows}
def get_codelists(version, url):
    """Retrieve the names of all codelists published for an IATI version.

    Versions "1.01"-"1.03" publish the index as ``codelist.xml`` with one
    ``name`` element per codelist; later versions publish ``codelists.xml``
    with ``codelist`` elements carrying a ``ref`` attribute.

    :param version: IATI version string, e.g. "2.03"
    :param url: base URL of the codelists for that version
    :return: tuple ``(codelist_url_template, names)``: a URL template with one
        ``{}`` placeholder for the codelist name, and the sorted list of
        names. The list is empty when the index could not be downloaded, in
        which case an error message is printed.
    """
    legacy = version in ["1.01", "1.02", "1.03"]
    if legacy:
        codelists_url = url + "codelist.xml"
        codelist_url_template = url + "codelist/{}.xml"
    else:
        codelists_url = url + "codelists.xml"
        codelist_url_template = url + "xml/{}.xml"

    result = requests.get(codelists_url)
    codelists = []

    if result.status_code == 200 and result.text:
        tree = ElementTree.fromstring(result.text)
        if legacy:
            codelists = [element.text for element in tree.iter('name')]
        else:
            # dict.fromkeys drops repeated refs while keeping first-seen order.
            codelists = list(dict.fromkeys(
                element.attrib['ref'] for element in tree.iter('codelist')
            ))
    else:
        print("ERROR: Could not retrieve codelists from {}".format(codelists_url))

    return codelist_url_template, sorted(codelists)
def _apply_backward_compatibility_hacks(name, codelist_dict):
    """Reproduce apparent hand-made changes to the codelist file, in place."""
    # HACK: Backward compatibility hacks for some apparent hand-made
    # changes to the codelist file.
    # FIXME: Remove these hacks when updating the IATI standard version
    rows = codelist_dict['rows']
    if name == 'FinanceType':
        # discard() instead of remove(): the field may be absent altogether.
        codelist_dict['fields'].discard('description')
        for row in rows:
            row.pop('description', None)
        # Drop the rows whose category is longer than three characters.
        codelist_dict['rows'] = [
            row for row in rows if len(row.get('category', '')) <= 3
        ]
    elif name == 'CollaborationType':
        # Strip the parenthesised remark from the name of the last row.
        if rows and 'name' in rows[-1]:
            last_row = rows[-1]
            last_row['name'] = re.sub(r'\(.*\)', '', last_row['name']).strip()


def generate_codelists_data(version):
    """Download and parse every codelist of the given IATI version.

    For each codelist the data structure returned by codelist_data is extended
    with the keys 'url' (where it was downloaded from) and 'name' (the IATI
    codelist name) and appended to the returned list.

    Country names are passed through prettify_country_name. Codelists that
    cannot be downloaded are reported and skipped.

    :param version: IATI version string; must be a key of VERSIONS
    :return: list of codelist dicts with keys 'fields', 'rows', 'url', 'name'
    """
    codelist_url_template, codelist_names = get_codelists(version, VERSIONS[version])

    data = []
    for name in codelist_names:
        # Ignore some names which are not codelists
        if name in ["IATIOrganisationIdentifier", ]:
            continue

        url = codelist_url_template.format(name)
        result = requests.get(url)
        if result.status_code != 200 or not result.text:
            # Couldn't fetch the result from the IATI site
            print("WARNING: Could not retrieve codelist {} from {}, skipping".format(name, url))
            continue

        print("Gathering data for {}...".format(name))
        transform = None
        if name == "Country":
            transform = {'field': 'name', 'func': prettify_country_name}
        codelist_dict = codelist_data(result, version, transform)

        _apply_backward_compatibility_hacks(name, codelist_dict)

        codelist_dict['url'] = url
        codelist_dict['name'] = name
        data.append(codelist_dict)

    return data
def data_to_strings(data):
    """Assemble the Python source text of every codelist.

    Uses the data structure created in generate_codelists_data. For each
    codelist the fields are ordered according to FIELDS_ORDER (unknown fields
    follow in alphabetical order, so the output is the same on every run),
    every row is rendered as a tuple of string literals and the result is
    filled into CODELIST_TEMPLATE. Non-empty values of fields listed in
    TRANSLATED_CODELISTS are wrapped in the translation marker.

    Side effect: codelists listed in JSON_CODELISTS are also written to their
    JSON file through write_codelist_json.

    :param data: list of codelist dicts with keys 'fields', 'rows', 'url', 'name'
    :return: list with one source-text string per codelist
    """
    def field_sort_key(field):
        # Known fields first, in FIELDS_ORDER order; the field name itself
        # breaks ties so that set iteration order never leaks into the output.
        if field in FIELDS_ORDER:
            return (FIELDS_ORDER.index(field), field)
        return (len(FIELDS_ORDER), field)

    codelists = []
    for codelist in data:
        sorted_fields = sorted(codelist['fields'], key=field_sort_key)
        translated_fields = TRANSLATED_CODELISTS.get(codelist['name'], [])

        field_names = "({}),".format(
            ", ".join([STRING_BIT.format(field) for field in sorted_fields]))

        rows = []
        for row in codelist['rows']:
            fields = []
            for field in sorted_fields:
                text = row.get(field, '')
                # don't tag empty strings for translation
                template = I18N_BIT if field in translated_fields and text else STRING_BIT
                # Escape backslashes before quotes so the emitted literal is
                # valid Python and evaluates back to the original text.
                escaped = text.replace('\\', '\\\\').replace('"', '\\"')
                fields.append(template.format(escaped))
            # NOTE(review): the single leading space is taken verbatim from the
            # source; confirm it was not a wider indent lost in extraction.
            rows.append(" ({}),".format(", ".join(fields)))

        output = CODELIST_TEMPLATE.format(
            url=codelist['url'],
            name=pythonify_codelist_name(codelist['name']),
            field_names=field_names,
            rows="\n".join(rows),
        )
        codelists.append(output)

        if codelist['name'] in JSON_CODELISTS:
            write_codelist_json(codelist)

    return codelists
def get_translation_pairs(version, lang):
    """Collect (English text, translated text) pairs from the IATI codelists.

    Only the codelists and fields listed in TRANSLATED_CODELISTS are
    considered. A field contributes a pair when the item holds more than one
    variant of it and both an English variant and one in ``lang`` are present.
    Variants without an ``xml:lang`` attribute count as English.

    Codelists that cannot be downloaded are reported and skipped.

    :param version: IATI version string; must be a key of VERSIONS
    :param lang: language code of the wanted translation, e.g. 'fr'
    :return: list of ``(english, translation)`` tuples
    """
    codelist_url_template, _names = get_codelists(version, VERSIONS[version])
    lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'

    translations = []
    for name, fields in sorted(TRANSLATED_CODELISTS.items()):
        url = codelist_url_template.format(name)
        result = requests.get(url)
        if result.status_code != 200 or not result.text:
            # Couldn't fetch the result from the IATI site
            print("WARNING: Could not retrieve codelist {} from {}, skipping".format(name, url))
            continue

        tree = ElementTree.fromstring(result.text.encode('utf-8'))
        if version in ["1.01", "1.02", "1.03"]:
            items = tree
        else:
            items = tree.find('codelist-items').findall('codelist-item')

        for item in items:
            for field in fields:
                variants = item.findall(field)
                # A single variant means there is nothing to pair up.
                if len(variants) <= 1:
                    continue
                by_lang = {
                    variant.get(lang_attr, 'en'): variant.text for variant in variants
                }
                # Without an English variant there is no source text to key on.
                if lang in by_lang and 'en' in by_lang:
                    translations.append((by_lang['en'], by_lang[lang]))

    return translations
def get_translation_csv(version, lang='fr'):
    """Write the translation pairs for ``lang`` to a CSV file.

    The file is created in the system's temporary directory and kept after the
    function returns; its path is printed. Every line holds the English text
    and its translation, both quoted, and the file is encoded as UTF-8.

    :param version: IATI version string; must be a key of VERSIONS
    :param lang: language code of the wanted translation, defaults to 'fr'
    """
    print('Getting translations for {}'.format(lang))
    translations = get_translation_pairs(version, lang=lang)

    # NamedTemporaryFile creates the file atomically (mktemp only returns a
    # name and is open to races); delete=False keeps it for the user.
    with tempfile.NamedTemporaryFile(
        'w', suffix='.csv', delete=False, encoding='utf-8', newline=''
    ) as f:
        # The csv module quotes every value and escapes embedded quotes.
        writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator='\n')
        writer.writerows(translations)

    print('Translations csv written to {}'.format(f.name))
def write_codelist_json(codelist, dry_run=False):
    """Export a codelist as a list of ``{'value', 'label'}`` options in JSON.

    The codelist must have an entry in JSON_CODELISTS, which supplies the
    output path (relative to JSON_CODELISTS_PATH_PREFIX) and the formatting
    options. The value of an option is the row's code; its label is
    "<code> - <name>", or only the name when the 'prefix-code' option is off
    or the row has no name of its own.

    :param codelist: codelist dict with at least the keys 'name' and 'rows'
    :param dry_run: when True, build and return the options without writing
    :return: the list of option dicts
    :raises KeyError: if the codelist is not configured in JSON_CODELISTS
    """
    config = JSON_CODELISTS[codelist['name']]
    prefix_enabled = config.get('prefix-code', True)

    def get_row_label(row):
        code = row['code']
        name = row.get('name', code)
        # Never produce "<code> - <code>" for rows that fall back to the code.
        if prefix_enabled and name != code:
            return f"{code} - {name}"
        return name

    data = [
        {'value': row['code'], 'label': get_row_label(row)}
        for row in codelist['rows']
    ]
    if config.get('add-empty', False):
        data.insert(0, {"value": "", "label": "None"})

    if dry_run:
        return data

    path = join(JSON_CODELISTS_PATH_PREFIX, config['path'])
    # ensure_ascii=False emits raw non-ASCII characters, so the encoding must
    # be fixed rather than taken from the locale.
    with open(path, 'w', encoding='utf-8') as f:
        # FIXME: Set indent=0 so that the files are easily diffable
        # Not setting it right now, to reduce the changes with existing files
        indent = config.get('indent')
        separators = config.get('separators', (',', ':'))
        json.dump(data, f, separators=separators, ensure_ascii=False, indent=indent)

    return data
if __name__ == '__main__':
    # Command line entry point: generate store/codelists_vXXX.py for the
    # requested IATI version, or - with --translate - only export a CSV file
    # with the translations for one language.
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--version", help="version, e.g. '1.01' (required)",
                        required=True)
    parser.add_argument("-t", "--translate", help="translation language code e.g. 'fr'")
    args = parser.parse_args()

    # Version has to be one of the allowed versions
    if args.version not in VERSIONS:
        print("Error; Version should be one of the following:")
        for version in VERSIONS:
            print("- %s" % version)
        # Non-zero exit status so that callers can detect the failure.
        sys.exit(1)

    if args.translate:
        get_translation_csv(args.version, args.translate)
        # The export succeeded, so report success.
        sys.exit(0)

    data_dict = generate_codelists_data(args.version)
    identifiers = [pythonify_codelist_name(data['name']) for data in data_dict]
    strings = data_to_strings(data_dict)
    codelists = '\n'.join(strings)

    codelist_path = join(
        HERE, '..', 'store', "codelists_v%s.py" % args.version.replace(".", ""))
    # The generated module declares utf-8, so write it as utf-8 regardless of
    # the locale's default encoding.
    with open(codelist_path, "w", encoding="utf-8") as iati_file:
        iati_file.write('# -*- coding: utf-8 -*-\n\n')
        iati_file.write('from django.utils.translation import gettext_lazy as _\n\n')
        # NOTE(review): the single space inside these literals is taken
        # verbatim from the source; confirm it was not a wider indent.
        iati_file.write('codelist_list = [\n "{}"\n]\n'.format('",\n "'.join(identifiers)))
        iati_file.write(codelists)
        iati_file.write('\n')