#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Akvo RSR is covered by the GNU Affero General Public License. See more
# details in the license.txt file located at the root folder of the Akvo RSR
# module. For additional details on the GNU license please see
# <http://www.gnu.org/licenses/agpl.html>.
"""
Generate a python file, codelists_vXXX.py, in the codelists folder that contains a python representation of all
IATI codelists, based on the IATI version.
See http://iatistandard.org/codelists/ and http://iatistandard.org/codelists/code-list-api/
"""
import argparse
import json
from os.path import abspath, dirname, join
import re
import requests
import sys
import tempfile
from xml.etree import ElementTree
# Directory that contains this script; the __main__ block resolves the output
# file path relative to it.
HERE = dirname(abspath(__file__))
# Modify this list to add new versions
# Maps every supported IATI version string to the base URL its codelists are
# downloaded from (see get_codelists for how the URL is extended).
# NOTE(review): "1.01" points at the codelists102 archive, the same URL as
# "1.02" -- confirm this is intentional.
VERSIONS = {
"1.01": "http://codelists102.archive.iatistandard.org/data/",
"1.02": "http://codelists102.archive.iatistandard.org/data/",
"1.03": "http://codelists103.archive.iatistandard.org/data/",
"1.04": "http://iatistandard.org/104/codelists/downloads/clv2/",
"1.05": "http://iatistandard.org/105/codelists/downloads/clv2/",
"2.01": "http://iatistandard.org/201/codelists/downloads/clv2/",
"2.02": "http://iatistandard.org/202/codelists/downloads/clv2/",
"2.03": "http://iatistandard.org/203/codelists/downloads/clv2/",
}
# Preferred column order of the generated codelist tuples; data_to_strings
# places fields that are not listed here after these ones.
FIELDS_ORDER = ("category", "code", "name", "description", "url")
# Codelists that are translated, mapped to the fields of each codelist that
# are translatable. data_to_strings wraps the non-empty values of these fields
# in I18N_BIT, and get_translation_pairs collects translations for them.
# Commented-out entries are currently not translated; the trailing remarks
# record why some of them were left out.
TRANSLATED_CODELISTS = {
# 'AidType': [u"name"], # Very long descriptions!
# 'ActivityScope': [u"name", u"description"],
'ActivityStatus': ["name", "description"],
# 'BudgetIdentifier': [u"name"],
# 'BudgetIdentifierVocabulary': [u"name", u"description"],
# 'BudgetStatus': [u"name", u"description"],
# 'BudgetType': [u"name", u"description"],
# one very long name, probably a data bug 'CollaborationType': [u"name", u"description"],
# 'ConditionType': [u"name", u"description"],
'ContactType': ["name", "description"],
# 'CRSAddOtherFlags': [u"name", u"description"],
# 'Currency': [u"name"],
# 'Country': [u"name"],
# 'DisbursementChannel': [u"name", u"description"],
# 'FinanceType': [u"name"], # Very long descriptions!
# 'DocumentCategory': [u"name", u"description"],
# 'FlowType': [u"name", u"description"],
# 'GeographicLocationClass': [u"name", u"description"],
# 'GeographicLocationReach': [u"name", u"description"],
# 'GeographicVocabulary': [u"name", u"description"],
# 'HumanitarianScopeType': [u"name"],
# 'HumanitarianScopeVocabulary': [u"name"],
'IndicatorMeasure': ["name", "description"],
# 'IndicatorVocabulary': [u"name"],
# 'Language': [u"name"],
# 'LoanRepaymentPeriod': [u"name", u"description"],
# 'LoanRepaymentType': [u"name", u"description"],
# 'LocationType': [u"name"],
# 'OrganisationType': [u"name"],
# 'PolicyMarker': [u"name", u"description"],
# 'PolicySignificance': [u"name"],
# 'Region': [u"name"],
# 'RegionVocabulary': [u"name", u"description"],
'RelatedActivityType': ["name", "description"],
'ResultType': ["name", "description"],
# 'Sector': [u"name"], # very long descriptions
# 'SectorCategory': [u"name"], # very long descriptions
'SectorVocabulary': ["name", "description"],
# 'TiedStatus': [u"name", u"description"],
# 'TransactionType': [u"name", u"description"],
}
# Codelists that are additionally exported as JSON option lists by
# write_codelist_json. Per-codelist settings read by that function:
#   'path'        -- output file, relative to JSON_CODELISTS_PATH_PREFIX
#   'prefix-code' -- when True (the default) labels read "<code> - <name>",
#                    otherwise just the name
#   'indent', 'separators' -- passed on to json.dump
#   'add-empty'   -- when True, an empty "None" option is inserted first
#                    (not set by any entry below)
# The "Section N" comments mirror the first component of each path.
JSON_CODELISTS = {
# Section 1
'AidType': {'path': 'section1/options/aid-types.json'},
'AidTypeVocabulary': {'path': 'section1/options/aid-type-vocabulary.json'},
'FinanceType': {'path': 'section1/options/finance-types.json'},
'FlowType': {'path': 'section1/options/flow-types.json'},
'TiedStatus': {'path': 'section1/options/tied-statuses.json', 'prefix-code': False},
# Section 6
'BudgetIdentifier': {'path': 'section6/country-budget-items/options.json'},
'DisbursementChannel': {'path': 'section6/transactions/options/channels.json'},
'TransactionType': {'path': 'section6/transactions/options/type-options.json', 'prefix-code': False},
# Section 7
'ActivityScope': {'path': 'section7/scope-options.json'},
'GeographicVocabulary': {'path': 'section7/location-items/admin-vocab-options.json'},
'LocationType': {'path': 'section7/location-items/feature-options.json'},
'Region': {'path': 'section7/recipient-regions/regions.json'},
# Section 8
'Sector': {'path': 'section8/vocab-1-codes.json', 'indent': 2, 'separators': (',', ': '), 'prefix-code': False},
'SectorCategory': {'path': 'section8/vocab-2-codes.json', 'prefix-code': False},
'SectorVocabulary': {'path': 'section8/vocab.json'},
'PolicySignificance': {'path': 'section8/policy-markers/significances.json'},
'PolicyMarker': {'path': 'section8/policy-markers/markers.json', 'prefix-code': False},
# Section 9
'DocumentCategory': {'path': 'section9/docs/categories.json'},
'FileFormat': {'path': 'section9/docs/formats.json'},
'Language': {'path': 'section9/docs/languages.json', 'prefix-code': False},
# Section 11
'CRSChannelCode': {'path': 'section11/channel-codes.json'},
}
# Directory the 'path' values above are joined onto. It is a relative path, so
# it is resolved against the current working directory when the script runs.
JSON_CODELISTS_PATH_PREFIX = 'akvo/rsr/spa/app/modules/editor/'
# Skeleton of a complete generated codelists module.
# NOTE(review): not referenced by the code visible in this file -- the
# __main__ block writes the file header itself; confirm whether it is still needed.
DOC_TEMPLATE = """# -*- coding: utf-8 -*-
from django.utils.translation import gettext_lazy as _
{codelists}
"""
# Source-code layout of one codelist: a comment with the source URL followed
# by a tuple holding the field-name row and the value rows. Filled in by
# data_to_strings.
CODELIST_TEMPLATE = """
# From {url}
{name} = (
{field_names}
{rows}
)"""
# Format of a plain string value in the generated code.
STRING_BIT = '"{}"'
# Format of a value that is marked for translation in the generated code.
I18N_BIT = '_(u"{}")'
def pythonify_codelist_name(codelist_name):
    """Turn a CamelCase codelist name into an UPPER_SNAKE_CASE identifier.

    The name is split in front of every capital letter, so ``OrganisationType``
    becomes ``ORGANISATION_TYPE``. A run of capitals is split letter by letter
    (``CRSAddOtherFlags`` becomes ``C_R_S_ADD_OTHER_FLAGS``), characters in
    front of the first capital letter are dropped, and hyphens are replaced by
    underscores.

    :param codelist_name: the codelist name as published by IATI
    :return: the identifier used for the codelist in the generated module
    """
    bits = re.findall('[A-Z][^A-Z]*', codelist_name)
    return '_'.join(bits).upper().replace("-", "_")
def prettify_country_name(country):
    """Turn an ALL CAPS country name into a readable mixed-case name.

    Every space-separated word is capitalized, except for a few small words
    ("the", "and", "of", "da") which stay lower case unless they start the
    name or follow a comma. "U.S.", words in parentheses and hyphenated words
    get special treatment. Empty input and repeated spaces are passed through
    without raising.

    :param country: the country name, typically in upper case
    :return: the prettified country name
    """
    small_words = ('the', 'and', 'of', 'da')
    bits = []
    previous = ''
    for position, bit in enumerate(country.lower().split(' ')):
        # don't capitalize small words unless they start the name or follow a
        # comma; endswith() is used because ``previous`` may be empty
        if bit not in small_words or position == 0 or previous.endswith(','):
            bit = bit.capitalize()
        # special case for U.S.
        if bit == 'U.s.':
            bit = 'U.S.'
        # Capitalize inside parentheses; startswith() is safe for empty words
        # produced by consecutive spaces
        if bit.startswith('('):
            bit = "({}".format(bit[1:].capitalize())
        # Fix hyphenated names
        if '-' in bit:
            bit = '-'.join([b.capitalize() for b in bit.split('-')])
        bits.append(bit)
        previous = bit
    return ' '.join(bits)
def codelist_data(result, version, transform=None):
    """Parse the XML of one codelist into a dict of field names and rows.

    The returned structure has the following format::

        {
            'fields': {'<field_name_1>', '<field_name_2>', ...},
            'rows': [
                {
                    '<field_name_1>': '<codelist_value_1>',
                    '<field_name_2>': '<codelist_value_2>',
                    ...
                },
                ...
            ]
        }

    :param result: response object whose ``text`` attribute holds the XML
    :param version: IATI version string; for "1.01" - "1.03" the items are the
        direct children of the root element, for later versions they are the
        ``codelist-item`` elements inside ``codelist-items``
    :param transform: optional dict ``{'field': <tag>, 'func': <callable>}``;
        the callable is applied to the text of every field with that tag
    :return: dict with the set of all field names seen in any item ('fields')
        and one dict per item ('rows')
    """
    tree = ElementTree.fromstring(result.text.encode('utf-8'))
    if version in ["1.01", "1.02", "1.03"]:
        items = tree
    else:
        items = tree.find('codelist-items').findall('codelist-item')

    rows = []
    # Collected across ALL items, since not all items have all fields. It must
    # live outside the loop, otherwise only the last item's fields survive and
    # an empty codelist leaves the name undefined.
    fields = set()
    for item in items:
        row = {}
        for field in item:
            # an attrib here indicates an alternative language, which we skip for now
            if field.attrib:
                continue
            fields.add(field.tag)
            text = field.text.replace('\n', '').replace('\r', '') if field.text else ''
            if transform and transform['field'] == field.tag:
                text = transform['func'](text)
            row[field.tag] = text
        rows.append(row)
    return {'fields': fields, 'rows': rows}
def get_codelists(version, url):
    """Retrieve the names of all codelists available for an IATI version.

    The location of the index document and of the individual codelists differs
    between versions "1.01" - "1.03" and later versions. When the index cannot
    be retrieved an error is printed and an empty list of names is returned.

    :param version: IATI version string, e.g. "2.03"
    :param url: base URL of the codelists for that version
    :return: tuple of (URL template with one ``{}`` placeholder for a codelist
        name, alphabetically sorted list of codelist names)
    """
    legacy = version in ["1.01", "1.02", "1.03"]
    if legacy:
        codelists_url = url + "codelist.xml"
        codelist_url_template = url + "codelist/{}.xml"
    else:
        codelists_url = url + "codelists.xml"
        codelist_url_template = url + "xml/{}.xml"

    result = requests.get(codelists_url)
    codelists = []
    if result.status_code == 200 and result.text:
        tree = ElementTree.fromstring(result.text)
        if legacy:
            codelists = [codelist.text for codelist in tree.iter('name')]
        else:
            # a codelist may be referenced more than once; keep each name once
            codelists = list({codelist.attrib['ref'] for codelist in tree.iter('codelist')})
    else:
        print("ERROR: Could not retrieve codelists from {}".format(codelists_url))
    return codelist_url_template, sorted(codelists)
def _apply_compat_hacks(name, codelist_dict):
    """Apply the hand-made backward compatibility tweaks to a codelist, in place."""
    # HACK: Backward compatibility hacks for some apparent hand-made
    # changes to the codelist file.
    # FIXME: Remove these hacks when updating the IATI standard version
    if name == 'FinanceType':
        # discard() instead of remove(): not every version has a description
        codelist_dict['fields'].discard('description')
        kept_rows = []
        for row in codelist_dict['rows']:
            row.pop('description', None)
            # rows with a category longer than three characters are dropped
            if len(row.get('category', '')) <= 3:
                kept_rows.append(row)
        codelist_dict['rows'] = kept_rows
    elif name == 'CollaborationType' and codelist_dict['rows']:
        # strip the parenthesised part from the name of the last row
        row = codelist_dict['rows'][-1]
        if 'name' in row:
            row['name'] = re.sub(r'\(.*\)', '', row['name']).strip()


def generate_codelists_data(version):
    """Download and parse all codelists of an IATI version.

    For each codelist the data structure returned from codelist_data is
    extended with the fields 'url' and 'name' and appended to the list that is
    returned, holding all data for all codelists. Codelists that cannot be
    downloaded are skipped with a warning.

    :param version: IATI version string; must be a key of VERSIONS
    :return: list of dicts with the keys 'fields', 'rows', 'url' and 'name'
    """
    codelist_url_template, codelist_names = get_codelists(version, VERSIONS[version])
    data = []
    for name in codelist_names:
        if name in ["IATIOrganisationIdentifier", ]:
            # Ignore some names which are not codelists
            continue

        url = codelist_url_template.format(name)
        result = requests.get(url)
        if result.status_code != 200 or not result.text:
            # Couldn't fetch the result from the IATI site
            print("WARNING: Could not retrieve codelist {} from {}".format(name, url))
            continue

        print("Gathering data for {}...".format(name))
        if name == "Country":
            codelist_dict = codelist_data(
                result, version, {'field': 'name', 'func': prettify_country_name})
        else:
            codelist_dict = codelist_data(result, version)

        _apply_compat_hacks(name, codelist_dict)

        codelist_dict['url'] = url
        codelist_dict['name'] = name
        data.append(codelist_dict)
    return data
def data_to_strings(data):
    """Assemble the source-code string of every codelist.

    Uses the data structure created in generate_codelists_data. As a side
    effect, every codelist that is listed in JSON_CODELISTS is also written to
    its JSON file through write_codelist_json.

    :param data: list of codelist dicts with 'fields', 'rows', 'url', 'name'
    :return: list of strings, one CODELIST_TEMPLATE rendering per codelist
    """
    def field_sort_key(field):
        # Known fields first, in FIELDS_ORDER; the rest by first letter. The
        # field name itself breaks ties so the column order is deterministic
        # even though 'fields' is an unordered set.
        if field in FIELDS_ORDER:
            return (FIELDS_ORDER.index(field), '')
        return (100 + ord(field[0]), field)

    codelists = []
    for codelist in data:
        sorted_fields = sorted(codelist['fields'], key=field_sort_key)
        translated_fields = TRANSLATED_CODELISTS.get(codelist['name'], [])
        field_names = "({}),".format(
            ", ".join([STRING_BIT.format(field) for field in sorted_fields]))

        rows = []
        for row in codelist['rows']:
            fields = []
            for field in sorted_fields:
                text = row.get(field, '')
                # don't tag empty strings for translation
                if field in translated_fields and text:
                    template = I18N_BIT
                else:
                    template = STRING_BIT
                # escape backslashes before quotes so the value stays a valid
                # Python string literal in the generated module
                escaped = text.replace('\\', '\\\\').replace('"', '\\"')
                fields.append(template.format(escaped))
            rows.append(" ({}),".format(", ".join(fields)))

        output = CODELIST_TEMPLATE.format(
            url=codelist['url'],
            name=pythonify_codelist_name(codelist['name']),
            field_names=field_names,
            rows="\n".join(rows)
        )
        codelists.append(output)

        if codelist['name'] in JSON_CODELISTS:
            write_codelist_json(codelist)
    return codelists
def get_translation_pairs(version, lang):
    """Collect (English text, translated text) pairs for the translated codelists.

    Every codelist in TRANSLATED_CODELISTS is downloaded and, for each of its
    translatable fields, the value without a language attribute (treated as
    English) is paired with the value in the requested language. Fields that
    have no alternative languages, no English value or no value in the
    requested language are skipped, as are codelists that cannot be downloaded.

    :param version: IATI version string; must be a key of VERSIONS
    :param lang: language code of the wanted translation, e.g. 'fr'
    :return: list of (english, translated) tuples
    """
    codelist_url_template, _names = get_codelists(version, VERSIONS[version])
    lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'
    translations = []
    for name, fields in sorted(TRANSLATED_CODELISTS.items()):
        url = codelist_url_template.format(name)
        result = requests.get(url)
        if result.status_code != 200 or not result.text:
            # Couldn't fetch the result from the IATI site
            print("WARNING: Could not retrieve codelist {} from {}".format(name, url))
            continue

        tree = ElementTree.fromstring(result.text.encode('utf-8'))
        items = (
            tree if version in ["1.01", "1.02", "1.03"] else
            tree.find('codelist-items').findall('codelist-item')
        )
        for item in items:
            for field in fields:
                values = item.findall(field)
                # a single value means there are no translations
                if len(values) <= 1:
                    continue
                # values without a language attribute are the English ones
                values = {
                    value.get(lang_attr, 'en'): value.text for value in values
                }
                source, translated = values.get('en'), values.get(lang)
                # skip missing or empty texts instead of failing on them
                if source and translated:
                    translations.append((source, translated))
    return translations
def get_translation_csv(version, lang='fr'):
    """Write the translation pairs of a language to a temporary CSV file.

    Each line of the file holds the English text and its translation, both
    quoted. The file is UTF-8 encoded, is not deleted afterwards, and its path
    is printed.

    :param version: IATI version string; must be a key of VERSIONS
    :param lang: language code of the wanted translation
    """
    import csv

    print('Getting translations for {}'.format(lang))
    translations = get_translation_pairs(version, lang=lang)
    # NamedTemporaryFile creates the file atomically (mktemp only returns a
    # name and is open to races). The file is opened in text mode, so rows are
    # written as str, not as encoded bytes.
    with tempfile.NamedTemporaryFile(
            mode='w', suffix='.csv', delete=False, encoding='utf-8', newline='') as f:
        # QUOTE_ALL keeps the '"en","translation"' layout and also escapes
        # quotes that occur inside the texts
        writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator='\n')
        writer.writerows(translations)
    print('Translations csv written to {}'.format(f.name))
def write_codelist_json(codelist, dry_run=False):
    """Export a codelist as a JSON list of ``{'value', 'label'}`` options.

    The output location and formatting come from the codelist's entry in
    JSON_CODELISTS. The path is relative (JSON_CODELISTS_PATH_PREFIX), so it
    is resolved against the current working directory.

    :param codelist: codelist dict with at least 'name' and 'rows'; every row
        needs a 'code'
    :param dry_run: when True, nothing is written and only the data is returned
    :return: the list of option dicts
    :raises KeyError: if the codelist has no entry in JSON_CODELISTS
    """
    config = JSON_CODELISTS[codelist['name']]
    prefix_code = config.get('prefix-code', True)

    def get_row_label(row):
        """Return '<code> - <name>' or just the name, depending on the config."""
        code = row['code']
        label = row.get('name', code)
        # don't produce "X - X" when the name is missing or equals the code
        if prefix_code and label != code:
            return f"{code} - {label}"
        return label

    data = [
        {'value': row['code'], 'label': get_row_label(row)}
        for row in codelist['rows']
    ]
    if config.get('add-empty', False):
        data.insert(0, {"value": "", "label": "None"})

    if dry_run:
        return data

    path = join(JSON_CODELISTS_PATH_PREFIX, config['path'])
    # ensure_ascii=False emits non-ASCII characters as-is, so the encoding
    # must not depend on the platform default
    with open(path, 'w', encoding='utf-8') as f:
        # FIXME: Set indent=0 so that the files are easily diffable
        # Not setting it right now, to reduce the changes with existing files
        indent = config.get('indent')
        separators = config.get('separators', (',', ':'))
        json.dump(data, f, separators=separators, ensure_ascii=False, indent=indent)
    return data
if __name__ == '__main__':
    # Command line entry point: either write a translation CSV (-t) or
    # generate the codelists_vXXX.py module for the requested IATI version.
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--version", help="version, e.g. '1.01' (required)", required=True)
    parser.add_argument("-t", "--translate", help="translation language code e.g. 'fr'")
    args = parser.parse_args()

    # Version has to be one of the allowed versions
    if args.version not in VERSIONS:
        print("Error; Version should be one of the following:")
        for version in VERSIONS:
            print("- %s" % version)
        # non-zero exit status so that callers can detect the failure
        sys.exit(1)

    if args.translate:
        get_translation_csv(args.version, args.translate)
        # the translation run completed, which is a success
        sys.exit(0)

    data_dict = generate_codelists_data(args.version)
    identifiers = [pythonify_codelist_name(data['name']) for data in data_dict]
    strings = data_to_strings(data_dict)
    codelists = '\n'.join(strings)

    codelist_path = join(HERE, '..', 'store', "codelists_v%s.py" % args.version.replace(".", ""))
    # the generated module declares utf-8, so write it as utf-8 regardless of
    # the platform default encoding
    with open(codelist_path, "w", encoding="utf-8") as iati_file:
        iati_file.write('# -*- coding: utf-8 -*-\n\n')
        iati_file.write('from django.utils.translation import gettext_lazy as _\n\n')
        iati_file.write('codelist_list = [\n "{}"\n]\n'.format('",\n "'.join(identifiers)))
        iati_file.write(codelists)
        iati_file.write('\n')