From fbc824f4f43ea756fab011e2e497945f674df339 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Sat, 8 Mar 2025 12:50:10 -0500 Subject: [PATCH 01/10] implement schema.org support (#231) --- pygeometa/schemas/schema_org/__init__.py | 582 +++++++++++++++++++++++ 1 file changed, 582 insertions(+) create mode 100644 pygeometa/schemas/schema_org/__init__.py diff --git a/pygeometa/schemas/schema_org/__init__.py b/pygeometa/schemas/schema_org/__init__.py new file mode 100644 index 0000000..764a4ed --- /dev/null +++ b/pygeometa/schemas/schema_org/__init__.py @@ -0,0 +1,582 @@ +# ================================================================= +# +# Terms and Conditions of Use +# +# Unless otherwise noted, computer program source code of this +# distribution # is covered under Crown Copyright, Government of +# Canada, and is distributed under the MIT License. +# +# The Canada wordmark and related graphics associated with this +# distribution are protected under trademark law and copyright law. +# No permission is granted to use them outside the parameters of +# the Government of Canada's corporate identity program. For +# more information, see +# http://www.tbs-sct.gc.ca/fip-pcim/index-eng.asp +# +# Copyright title to all 3rd party software distributed with this +# software is held by the respective copyright holders as noted in +# those files. Users are asked to read the 3rd Party Licenses +# referenced with those assets. +# +# Copyright (c) 2025 Tom Kralidis +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# ================================================================= + +from datetime import date, datetime +import json +import logging +import os +from typing import Union + +from pygeometa.core import get_charstring +from pygeometa.helpers import json_dumps +from pygeometa.schemas.base import BaseOutputSchema + +THISDIR = os.path.dirname(os.path.realpath(__file__)) + +LOGGER = logging.getLogger(__name__) + +TYPES = { + 'Series': 'series', + 'SoftwareApplication': 'software', + 'ProductModel': 'model', + 'Dataset': 'dataset', + 'WebAPI': 'service', + 'Property': 'attribute', + 'ListItem': 'feature' +} + + +class SchemaOrgOutputSchema(BaseOutputSchema): + """Schema.org schema""" + + def __init__(self): + """ + Initialize object + + :returns: pygeometa.schemas.base.BaseOutputSchema + """ + + description = 'Schema.org' + + super().__init__('schema-org', description, 'json', THISDIR) + + def import_(self, metadata: str) -> dict: + """ + Import metadata into MCF + + :param metadata: string of metadata content + + :returns: `dict` of MCF content + """ + + md = json.loads(metadata) + + mcf = { + 'mcf': { + 'version': '1.0', + }, + 'metadata': {}, + 'identification': { + 'extents': { + 'spatial': [] + } + }, + 'contact': {}, + 'distribution': {} + } + + mcf['metadata']['identifier'] = md['identifier'] + mcf['metadata']['charset'] = 'utf-8' + mcf['metadata']['type'] = TYPES[md.get('type', 'Dataset')] + mcf['metadata']['language'] = md.get('inLanguage', 'en') + + if 'spatialCoverage' in md or 'spatial' in md: + crs = 4326 + geo = md['spatialCoverage']['geo'] + if geo['@type'] == 'GeoCoordinates': + mcf['spatial']['datatype'] = 'vector' + mcf['spatial']['geomtype'] = 'point' + bbox = [geo['longitude'], geo['latitude'], + geo['longitude'], geo['latitude']] + elif geo['@type'] == 'GeoShape': + mcf['spatial']['datatype'] = 'vector' + mcf['spatial']['geomtype'] = 'polygon' + bbox = geo['box'].split() + + mcf['identification']['extents']['spatial'].append({ + 'bbox': bbox, + 'crs': crs + }) + + if 'temporalCoverage' in md: + begin, end = md['temporalCoverage'].split('/') + mcf['identification']['extents']['temporal'] = [{ + 'begin': begin, + 'end': end + }] + + mcf['identification']['language'] = mcf['metadata']['language'] + mcf['identification']['title'] = md['name'] + mcf['identification']['abstract'] = md['description'] + + if 'dateCreated' in md: + mcf['metadata']['identification']['creation'] = md['datePublished'] + if 'datePublished' in md: + mcf['metadata']['identification']['publication'] = md['datePublished'] # noqa + if 'dateModified' in md: + mcf['metadata']['identification']['revision'] = md['dateModified'] + + if 'version' in md: + mcf['metadata']['identification']['edition'] = md['version'] + + mcf['identification']['keywords'] = { + 'default': { + 'keywords': md['keywords'] + } + } + + for dist in md['distribution']: + mcf['distribution'][dist['name']] = { + 'name': dist['name'], + 'type': dist['encodingFormat'], + 'url': dist['contentUrl'], + 'rel': 'download', + 'function': 'download' + } + + for ct in ['author', 'publisher', 'creator', 'provider', 'funder']: + if ct in md: + contact = {} + contact['url'] = md[ct]['url'] + contact['individualname'] = md[ct]['name'] + if md[ct]['@type'] == 'Organization': + contact['organization'] = md[ct]['name'] + + if 'address' in md[ct]: + contact['address'] = md[ct]['streetAddress'] + contact['city'] = md[ct]['addressLocality'] + contact['administrativearea'] = md[ct]['addressRegion'] + contact['postalcode'] = md[ct]['postalCode'] + contact['country'] = md[ct]['addressCountry'] + + if 'contactPoint' in md[ct]: + cp = md[ct][0] + contact['email'] = cp['email'] + contact['fax'] = cp['fax'] + + mcf['contact'][ct] = contact + + return mcf + + def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]: + """ + Write outputschema to JSON string buffer + + :param mcf: dict of MCF content model + :param stringify: whether to return a string representation (default) + else native (dict, etree) + + + :returns: `dict` or `str` of MCF as an OARec record representation + """ + + self.lang1 = mcf['metadata'].get('language') + self.lang2 = mcf['metadata'].get('language_alternate') + + minx, miny, maxx, maxy = (mcf['identification']['extents'] + ['spatial'][0]['bbox']) + + title = get_charstring(mcf['identification'].get('title'), + self.lang1, self.lang2) + + description = get_charstring(mcf['identification'].get('abstract'), + self.lang1, self.lang2) + + LOGGER.debug('Generating baseline record') + record = { + 'id': mcf['metadata']['identifier'], + 'conformsTo': [ + 'http://www.opengis.net/spec/ogcapi-records-1/1.0/conf/record-core', # noqa + ], + 'type': 'Feature', + 'geometry': { + 'type': 'Polygon', + 'coordinates': [[ + [minx, miny], + [minx, maxy], + [maxx, maxy], + [maxx, miny], + [minx, miny] + ]] + }, + 'properties': { + 'title': title[0], + 'description': description[0], + 'themes': [], + 'type': mcf['metadata']['hierarchylevel'], + }, + 'links': [] + } + + if self.lang1 is not None: + record['properties']['language'] = { + 'code': self.lang1 + } + + LOGGER.debug('Checking for temporal') + try: + begin = mcf['identification']['extents']['temporal'][0]['begin'] + end = mcf['identification']['extents']['temporal'][0].get('end') + + if begin in ['now', 'None', None]: + begin = '..' + else: + begin = str(begin) + + if end in ['now', 'None', None]: + end = '..' + else: + end = str(end) + + if [begin, end] == [None, None]: + record['time'] = None + elif [begin, end] == ['..', '..']: + pass + else: + record['time'] = { + 'interval': [begin, end] + } + + if 'resolution' in mcf['identification']['extents']['temporal'][0]: # noqa + record['time']['resolution'] = mcf['identification']['extents']['temporal'][0]['resolution'] # noqa + + except (IndexError, KeyError): + record['time'] = None + + LOGGER.debug('Checking for dates') + + if 'dates' in mcf['identification']: + if 'creation' in mcf['identification']['dates']: + record['properties']['created'] = self.generate_date(mcf['identification']['dates']['creation']) # noqa + + if 'revision' in mcf['identification']['dates']: + record['properties']['updated'] = self.generate_date(mcf['identification']['dates']['revision']) # noqa + + rights = get_charstring(mcf['identification'].get('rights'), + self.lang1, self.lang2) + + if rights != [None, None]: + record['properties']['rights'] = rights[0] + + formats = [] + for v in mcf['distribution'].values(): + format_ = get_charstring(v.get('format'), self.lang1, self.lang2) + if format_[0] is not None: + formats.append(format_[0]) + + LOGGER.debug('Checking for formats') + if formats: + formats2 = set(formats) + record['properties']['formats'] = [{'name': f} for f in formats2] + + LOGGER.debug('Checking for contacts') + record['properties']['contacts'] = self.generate_contacts( + mcf['contact']) + + all_keywords = [] + + LOGGER.debug('Checking for keywords') + for key, value in mcf['identification']['keywords'].items(): + theme = {'concepts': []} + scheme = None + + keywords = get_charstring(value.get('keywords'), self.lang1, + self.lang2) + + if 'vocabulary' in value: + if 'url' in value['vocabulary']: + scheme = value['vocabulary']['url'] + elif 'name' in value['vocabulary']: + scheme = value['vocabulary']['name'] + + if scheme is None: + LOGGER.debug('Keywords found without vocabulary') + LOGGER.debug('Aggregating as bare keywords') + all_keywords.extend(keywords[0]) + else: + LOGGER.debug('Adding as theme/concepts') + for kw in keywords[0]: + theme['concepts'].append({'id': kw}) + + theme['scheme'] = scheme + + record['properties']['themes'].append(theme) + + if all_keywords: + record['properties']['keywords'] = all_keywords + + if not record['properties']['themes']: + _ = record['properties'].pop('themes', None) + + LOGGER.debug('Checking for licensing') + if mcf['identification'].get('license') is not None: + license = mcf['identification']['license'] + + if 'url' in license: + LOGGER.debug('Encoding license as link') + license_link = { + 'rel': 'license', + 'type': 'text/html', + 'title': license.get('name', 'license for this resource'), + 'url': license['url'] + } + record['links'].append(self.generate_link(license_link)) + else: + LOGGER.debug('Encoding license as property') + record['properties']['license'] = license['name'] + + LOGGER.debug('Checking for distribution') + for value in mcf['distribution'].values(): + record['links'].append(self.generate_link(value)) + + if stringify: + return json_dumps(record) + + return record + + def generate_party(self, contact: dict, + lang1: str, lang2: str, roles: list) -> dict: + """ + generate party construct from MCF contact + + :param contact: dict of MCF contact + :param self.lang1: primary language + :param self.lang2: alternate language + :param roles: roles of contact + + :returns: MCF contact as a party representation + """ + + organization_name = get_charstring(contact.get('organization'), + self.lang1, self.lang2) + + individual_name = get_charstring(contact.get('individualname'), + self.lang1, self.lang2) + + position_name = get_charstring(contact.get('positionname'), + self.lang1, self.lang2) + + hours_of_service = get_charstring(contact.get('hoursofservice'), + self.lang1, self.lang2) + + contact_instructions = get_charstring( + contact.get('contactinstructions'), self.lang1, self.lang2) + + address = get_charstring(contact.get('address'), + self.lang1, self.lang2) + + city = get_charstring(contact.get('city'), self.lang1, self.lang2) + + administrative_area = get_charstring(contact.get('administrativearea'), + self.lang1, self.lang2) + + postalcode = get_charstring(contact.get('postalcode'), + self.lang1, self.lang2) + + country = get_charstring(contact.get('country'), + self.lang1, self.lang2) + + rp = { + 'addresses': [{}], + 'roles': [] + } + + if organization_name[0] is not None: + rp['organization'] = organization_name[0] + if individual_name[0] is not None: + rp['name'] = individual_name[0] + if position_name[0] is not None: + rp['position'] = position_name[0] + if hours_of_service[0] is not None: + rp['hoursOfService'] = hours_of_service[0] + if contact_instructions[0] is not None: + rp['contactInstructions'] = contact_instructions[0] + + if address[0] is not None: + rp['addresses'][0]['deliveryPoint'] = [address[0]] + if city[0] is not None: + rp['addresses'][0]['city'] = city[0] + if administrative_area[0] is not None: + rp['addresses'][0]['administrativeArea'] = administrative_area[0] + if postalcode[0] is not None: + rp['addresses'][0]['postalCode'] = postalcode[0] + if country[0] is not None: + rp['addresses'][0]['country'] = country[0] + + if contact.get('phone') is not None: + LOGGER.debug('Formatting phone number') + phone = contact['phone'] + phone = phone.replace('-', '').replace('(', '').replace(')', '') + phone = phone.replace('+0', '+').replace(' ', '') + + rp['phones'] = [{'value': phone}] + + if contact.get('email') is not None: + rp['emails'] = [{'value': contact.get('email')}] + + if rp['addresses'][0] == {}: + rp.pop('addresses') + + for r in set(roles): + rp['roles'].append(r) + + if 'url' in contact: + rp['links'] = [{ + 'rel': 'canonical', + 'type': 'text/html', + 'href': contact['url'] + }] + + return rp + + def generate_contacts(self, contact: dict) -> list: + """ + Generates 1..n contacts, streamlining identical + contacts with multiple roles + + :param contact: `dict` of contacts + + :returns: `list` of contacts + """ + + contacts = [] + contacts2 = [] + + for key, value in contact.items(): + if contacts: + for c in contacts: + if value == c['contact']: + LOGGER.debug('Found matching contact; adding role') + c['roles'].append(key) + else: + LOGGER.debug('Adding contact') + contacts.append({ + 'contact': value, + 'roles': [key] + }) + else: + contacts.append({ + 'contact': value, + 'roles': [key] + }) + + LOGGER.debug(f'Contacts: {contacts}') + for c in contacts: + contacts2.append(self.generate_party(c['contact'], self.lang1, + self.lang2, c['roles'])) + + return contacts2 + + def generate_link(self, distribution: dict) -> dict: + """ + Generates OARec link object from MCF distribution object + + :param distribution: `dict` of MCF distribution + + :returns: OARec link object + """ + + title = get_charstring(distribution.get('title'), + self.lang1, self.lang2) + + name = get_charstring(distribution.get('name'), self.lang1, self.lang2) + + link = { + 'href': distribution['url'] + } + + if distribution.get('type') is not None: + link['type'] = distribution['type'] + + reltype = distribution.get('rel') or distribution.get('function') + if reltype is not None: + link['rel'] = reltype + + if title != [None, None]: + link['title'] = title[0] + elif name != [None, None]: + link['title'] = name[0] + + if all(x in distribution['url'] for x in ['{', '}']): + link['templated'] = True + + if 'channel' in distribution: + link['channel'] = distribution['channel'] + + return link + + def generate_date(self, date_value: str) -> str: + """ + Helper function to derive RFC3339 date from MCF date type + + :param date_value: `str` of date value + + :returns: `str` of date-time value + """ + + value = None + + if isinstance(date_value, str) and date_value != 'None': + if len(date_value) == 10: # YYYY-MM-DD + format_ = '%Y-%m-%d' + elif len(date_value) == 7: # YYYY-MM + format_ = '%Y-%m' + elif len(date_value) == 4: # YYYY + format_ = '%Y' + elif len(date_value) == 19: # YYYY-MM-DDTHH:MM:SS + msg = 'YYYY-MM-DDTHH:MM:SS with no timezone; converting to UTC' + LOGGER.debug(msg) + format_ = '%Y-%m-%dT%H:%M:%S' + + LOGGER.debug('date type found; expanding to date-time') + value = datetime.strptime(date_value, format_).strftime('%Y-%m-%dT%H:%M:%SZ') # noqa + + elif isinstance(date_value, int) and len(str(date_value)) == 4: + date_value2 = str(date_value) + LOGGER.debug('date type found; expanding to date-time') + format_ = '%Y' + value = datetime.strptime(date_value2, format_).strftime('%Y-%m-%dT%H:%M:%SZ') # noqa + + elif isinstance(date_value, (date, datetime)): + value = date_value.strftime('%Y-%m-%dT%H:%M:%SZ') + + elif date_value in [None, 'None']: + value = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') + + else: + msg = f'Unknown date string: {date_value}' + raise RuntimeError(msg) + + return value From c680a702a6f4d198cda5b5e301b47fdb49881709 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Sat, 8 Mar 2025 12:50:38 -0500 Subject: [PATCH 02/10] implement schema.org support (#231) --- pygeometa/schemas/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pygeometa/schemas/__init__.py b/pygeometa/schemas/__init__.py index 3bdd6bf..5372fb4 100644 --- a/pygeometa/schemas/__init__.py +++ b/pygeometa/schemas/__init__.py @@ -53,12 +53,13 @@ THISDIR = os.path.dirname(os.path.realpath(__file__)) SCHEMAS = { + 'dcat': 'pygeometa.schemas.dcat.DCATOutputSchema', 'iso19139': 'pygeometa.schemas.iso19139.ISO19139OutputSchema', 'iso19139-2': 'pygeometa.schemas.iso19139_2.ISO19139_2OutputSchema', 'iso19139-hnap': 'pygeometa.schemas.iso19139_hnap.ISO19139HNAPOutputSchema', # noqa 'oarec-record': 'pygeometa.schemas.ogcapi_records.OGCAPIRecordOutputSchema', # noqa + 'schema-org': 'pygeometa.schemas.schema_org.SchemaOrgOutputSchema', 'stac-item': 'pygeometa.schemas.stac.STACItemOutputSchema', - 'dcat': 'pygeometa.schemas.dcat.DCATOutputSchema', 'wmo-cmp': 'pygeometa.schemas.wmo_cmp.WMOCMPOutputSchema', 'wmo-wcmp2': 'pygeometa.schemas.wmo_wcmp2.WMOWCMP2OutputSchema', 'wmo-wigos': 'pygeometa.schemas.wmo_wigos.WMOWIGOSOutputSchema', From a7e97ce104be36d91d2c79d0ed4f67cf39257a68 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Sun, 9 Mar 2025 06:54:29 -0400 Subject: [PATCH 03/10] fix tests --- tests/run_tests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/run_tests.py b/tests/run_tests.py index 4c89283..3ceb6c0 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -226,17 +226,17 @@ def test_get_supported_schemas(self): schemas = sorted(get_supported_schemas()) self.assertIsInstance(schemas, list, 'Expected list') - self.assertEqual(len(schemas), 10, + self.assertEqual(len(schemas), 11, 'Expected specific number of supported schemas') self.assertEqual(sorted(schemas), sorted(['cwl', 'dcat', 'iso19139', 'iso19139-2', - 'iso19139-hnap', 'oarec-record', + 'iso19139-hnap', 'oarec-record', 'schema-org', 'stac-item', 'wmo-cmp', 'wmo-wcmp2', 'wmo-wigos']), 'Expected exact list of supported schemas') schemas = get_supported_schemas(include_autodetect=True) - self.assertEqual(len(schemas), 11, + self.assertEqual(len(schemas), 12, 'Expected specific number of supported schemas') self.assertIn('autodetect', schemas, 'Expected autodetect in list') From ad16802ac52c8aeddcc9d1cc9f15c9cd9773cab5 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Sat, 15 Mar 2025 07:23:32 -0400 Subject: [PATCH 04/10] refactor datetime generation --- pygeometa/helpers.py | 45 +++++ pygeometa/schemas/ogcapi_records/__init__.py | 58 +------ pygeometa/schemas/schema_org/__init__.py | 172 +++++-------------- 3 files changed, 91 insertions(+), 184 deletions(-) diff --git a/pygeometa/helpers.py b/pygeometa/helpers.py index 916750e..dc07485 100644 --- a/pygeometa/helpers.py +++ b/pygeometa/helpers.py @@ -93,3 +93,48 @@ def json_serial(obj) -> Any: msg = f'{obj} type {type(obj)} not serializable' LOGGER.error(msg) raise TypeError(msg) + + +def generate_datetime(date_value: str) -> str: + """ + Helper function to derive RFC3339 date from MCF date type + + :param date_value: `str` of date value + + :returns: `str` of date-time value + """ + + value = None + + if isinstance(date_value, str) and date_value != 'None': + if len(date_value) == 10: # YYYY-MM-DD + format_ = '%Y-%m-%d' + elif len(date_value) == 7: # YYYY-MM + format_ = '%Y-%m' + elif len(date_value) == 4: # YYYY + format_ = '%Y' + elif len(date_value) == 19: # YYYY-MM-DDTHH:MM:SS + msg = 'YYYY-MM-DDTHH:MM:SS with no timezone; converting to UTC' + LOGGER.debug(msg) + format_ = '%Y-%m-%dT%H:%M:%S' + + LOGGER.debug('date type found; expanding to date-time') + value = datetime.strptime(date_value, format_).strftime('%Y-%m-%dT%H:%M:%SZ') # noqa + + elif isinstance(date_value, int) and len(str(date_value)) == 4: + date_value2 = str(date_value) + LOGGER.debug('date type found; expanding to date-time') + format_ = '%Y' + value = datetime.strptime(date_value2, format_).strftime('%Y-%m-%dT%H:%M:%SZ') # noqa + + elif isinstance(date_value, (date, datetime)): + value = date_value.strftime('%Y-%m-%dT%H:%M:%SZ') + + elif date_value in [None, 'None']: + value = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') + + else: + msg = f'Unknown date string: {date_value}' + raise RuntimeError(msg) + + return value diff --git a/pygeometa/schemas/ogcapi_records/__init__.py b/pygeometa/schemas/ogcapi_records/__init__.py index bee01ba..819bfc5 100644 --- a/pygeometa/schemas/ogcapi_records/__init__.py +++ b/pygeometa/schemas/ogcapi_records/__init__.py @@ -43,14 +43,13 @@ # # ================================================================= -from datetime import date, datetime import logging import os from typing import Union from pygeometa import __version__ from pygeometa.core import get_charstring -from pygeometa.helpers import json_dumps +from pygeometa.helpers import generate_datetime, json_dumps from pygeometa.schemas.base import BaseOutputSchema THISDIR = os.path.dirname(os.path.realpath(__file__)) @@ -165,12 +164,11 @@ def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]: LOGGER.debug('Checking for dates') - if 'dates' in mcf['identification']: - if 'creation' in mcf['identification']['dates']: - record['properties']['created'] = self.generate_date(mcf['identification']['dates']['creation']) # noqa - - if 'revision' in mcf['identification']['dates']: - record['properties']['updated'] = self.generate_date(mcf['identification']['dates']['revision']) # noqa + for key, value in mcf['identification']['dates'].items(): + if key == 'creation': + record['properties']['created'] = generate_datetime(value) + elif key == 'revision': + record['properties']['updated'] = generate_datetime(value) rights = get_charstring(mcf['identification'].get('rights'), self.lang1, self.lang2) @@ -426,47 +424,3 @@ def generate_link(self, distribution: dict) -> dict: link['channel'] = distribution['channel'] return link - - def generate_date(self, date_value: str) -> str: - """ - Helper function to derive RFC3339 date from MCF date type - - :param date_value: `str` of date value - - :returns: `str` of date-time value - """ - - value = None - - if isinstance(date_value, str) and date_value != 'None': - if len(date_value) == 10: # YYYY-MM-DD - format_ = '%Y-%m-%d' - elif len(date_value) == 7: # YYYY-MM - format_ = '%Y-%m' - elif len(date_value) == 4: # YYYY - format_ = '%Y' - elif len(date_value) == 19: # YYYY-MM-DDTHH:MM:SS - msg = 'YYYY-MM-DDTHH:MM:SS with no timezone; converting to UTC' - LOGGER.debug(msg) - format_ = '%Y-%m-%dT%H:%M:%S' - - LOGGER.debug('date type found; expanding to date-time') - value = datetime.strptime(date_value, format_).strftime('%Y-%m-%dT%H:%M:%SZ') # noqa - - elif isinstance(date_value, int) and len(str(date_value)) == 4: - date_value2 = str(date_value) - LOGGER.debug('date type found; expanding to date-time') - format_ = '%Y' - value = datetime.strptime(date_value2, format_).strftime('%Y-%m-%dT%H:%M:%SZ') # noqa - - elif isinstance(date_value, (date, datetime)): - value = date_value.strftime('%Y-%m-%dT%H:%M:%SZ') - - elif date_value in [None, 'None']: - value = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') - - else: - msg = f'Unknown date string: {date_value}' - raise RuntimeError(msg) - - return value diff --git a/pygeometa/schemas/schema_org/__init__.py b/pygeometa/schemas/schema_org/__init__.py index 764a4ed..1189c6b 100644 --- a/pygeometa/schemas/schema_org/__init__.py +++ b/pygeometa/schemas/schema_org/__init__.py @@ -43,14 +43,13 @@ # # ================================================================= -from datetime import date, datetime import json import logging import os from typing import Union from pygeometa.core import get_charstring -from pygeometa.helpers import json_dumps +from pygeometa.helpers import generate_datetime, json_dumps from pygeometa.schemas.base import BaseOutputSchema THISDIR = os.path.dirname(os.path.realpath(__file__)) @@ -123,7 +122,8 @@ def import_(self, metadata: str) -> dict: elif geo['@type'] == 'GeoShape': mcf['spatial']['datatype'] = 'vector' mcf['spatial']['geomtype'] = 'polygon' - bbox = geo['box'].split() + bt = geo['box'].split() + bbox = bt[1], bt[0], bt[3], bt[2] mcf['identification']['extents']['spatial'].append({ 'bbox': bbox, @@ -198,8 +198,7 @@ def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]: :param stringify: whether to return a string representation (default) else native (dict, etree) - - :returns: `dict` or `str` of MCF as an OARec record representation + :returns: `dict` or `str` of MCF as Schema.org """ self.lang1 = mcf['metadata'].get('language') @@ -216,34 +215,22 @@ def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]: LOGGER.debug('Generating baseline record') record = { - 'id': mcf['metadata']['identifier'], - 'conformsTo': [ - 'http://www.opengis.net/spec/ogcapi-records-1/1.0/conf/record-core', # noqa - ], - 'type': 'Feature', - 'geometry': { - 'type': 'Polygon', - 'coordinates': [[ - [minx, miny], - [minx, maxy], - [maxx, maxy], - [maxx, miny], - [minx, miny] - ]] - }, - 'properties': { - 'title': title[0], - 'description': description[0], - 'themes': [], - 'type': mcf['metadata']['hierarchylevel'], - }, - 'links': [] + 'identifier': mcf['metadata']['identifier'], + '@type': dict(zip(TYPES.values(), TYPES.keys()))[mcf['metadata']['hierarchylevel']], # noqa + 'spatialCoverage': [{ + '@type': 'Place', + 'geo': { + '@type': 'GeoShape', + 'box': f'{miny},{minx} {maxy},{maxx}' + } + }], + 'title': title[0], + 'description': description[0], + 'distribution': [] } if self.lang1 is not None: - record['properties']['language'] = { - 'code': self.lang1 - } + record['inLanguage'] = self.lang1 LOGGER.debug('Checking for temporal') try: @@ -265,44 +252,22 @@ def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]: elif [begin, end] == ['..', '..']: pass else: - record['time'] = { - 'interval': [begin, end] - } - - if 'resolution' in mcf['identification']['extents']['temporal'][0]: # noqa - record['time']['resolution'] = mcf['identification']['extents']['temporal'][0]['resolution'] # noqa - + record['temporalCoverage'] = [f'{begin}/{end}'] except (IndexError, KeyError): - record['time'] = None + pass LOGGER.debug('Checking for dates') - if 'dates' in mcf['identification']: - if 'creation' in mcf['identification']['dates']: - record['properties']['created'] = self.generate_date(mcf['identification']['dates']['creation']) # noqa - - if 'revision' in mcf['identification']['dates']: - record['properties']['updated'] = self.generate_date(mcf['identification']['dates']['revision']) # noqa - - rights = get_charstring(mcf['identification'].get('rights'), - self.lang1, self.lang2) - - if rights != [None, None]: - record['properties']['rights'] = rights[0] - - formats = [] - for v in mcf['distribution'].values(): - format_ = get_charstring(v.get('format'), self.lang1, self.lang2) - if format_[0] is not None: - formats.append(format_[0]) - - LOGGER.debug('Checking for formats') - if formats: - formats2 = set(formats) - record['properties']['formats'] = [{'name': f} for f in formats2] + for key, value in mcf['identification']['dates'].items(): + if key == 'creation': + record['dateCreated'] = generate_datetime(value) + elif key == 'revision': + record['dateModified'] = generate_datetime(value) + elif key == 'publication': + record['datePublished'] = generate_datetime(value) LOGGER.debug('Checking for contacts') - record['properties']['contacts'] = self.generate_contacts( + record['contacts'] = self.generate_contacts( mcf['contact']) all_keywords = [] @@ -332,13 +297,8 @@ def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]: theme['scheme'] = scheme - record['properties']['themes'].append(theme) - if all_keywords: - record['properties']['keywords'] = all_keywords - - if not record['properties']['themes']: - _ = record['properties'].pop('themes', None) + record['keywords'] = all_keywords LOGGER.debug('Checking for licensing') if mcf['identification'].get('license') is not None: @@ -352,14 +312,14 @@ def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]: 'title': license.get('name', 'license for this resource'), 'url': license['url'] } - record['links'].append(self.generate_link(license_link)) + record['distribution'].append(self.generate_link(license_link)) else: LOGGER.debug('Encoding license as property') - record['properties']['license'] = license['name'] + record['license'] = license['name'] LOGGER.debug('Checking for distribution') for value in mcf['distribution'].values(): - record['links'].append(self.generate_link(value)) + record['distribution'].append(self.generate_link(value)) if stringify: return json_dumps(record) @@ -453,7 +413,7 @@ def generate_party(self, contact: dict, rp['roles'].append(r) if 'url' in contact: - rp['links'] = [{ + rp['distribution'] = [{ 'rel': 'canonical', 'type': 'text/html', 'href': contact['url'] @@ -501,20 +461,18 @@ def generate_contacts(self, contact: dict) -> list: def generate_link(self, distribution: dict) -> dict: """ - Generates OARec link object from MCF distribution object + Generates Schema.org link object from MCF distribution object :param distribution: `dict` of MCF distribution - :returns: OARec link object + :returns: Schema.org link object """ - title = get_charstring(distribution.get('title'), - self.lang1, self.lang2) - - name = get_charstring(distribution.get('name'), self.lang1, self.lang2) + name = get_charstring(distribution.get('name'), + self.lang1, self.lang2) link = { - 'href': distribution['url'] + 'contentUrl': distribution['url'] } if distribution.get('type') is not None: @@ -524,59 +482,9 @@ def generate_link(self, distribution: dict) -> dict: if reltype is not None: link['rel'] = reltype - if title != [None, None]: - link['title'] = title[0] + if name != [None, None]: + link['name'] = name[0] elif name != [None, None]: - link['title'] = name[0] - - if all(x in distribution['url'] for x in ['{', '}']): - link['templated'] = True - - if 'channel' in distribution: - link['channel'] = distribution['channel'] + link['name'] = name[0] return link - - def generate_date(self, date_value: str) -> str: - """ - Helper function to derive RFC3339 date from MCF date type - - :param date_value: `str` of date value - - :returns: `str` of date-time value - """ - - value = None - - if isinstance(date_value, str) and date_value != 'None': - if len(date_value) == 10: # YYYY-MM-DD - format_ = '%Y-%m-%d' - elif len(date_value) == 7: # YYYY-MM - format_ = '%Y-%m' - elif len(date_value) == 4: # YYYY - format_ = '%Y' - elif len(date_value) == 19: # YYYY-MM-DDTHH:MM:SS - msg = 'YYYY-MM-DDTHH:MM:SS with no timezone; converting to UTC' - LOGGER.debug(msg) - format_ = '%Y-%m-%dT%H:%M:%S' - - LOGGER.debug('date type found; expanding to date-time') - value = datetime.strptime(date_value, format_).strftime('%Y-%m-%dT%H:%M:%SZ') # noqa - - elif isinstance(date_value, int) and len(str(date_value)) == 4: - date_value2 = str(date_value) - LOGGER.debug('date type found; expanding to date-time') - format_ = '%Y' - value = datetime.strptime(date_value2, format_).strftime('%Y-%m-%dT%H:%M:%SZ') # noqa - - elif isinstance(date_value, (date, datetime)): - value = date_value.strftime('%Y-%m-%dT%H:%M:%SZ') - - elif date_value in [None, 'None']: - value = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ') - - else: - msg = f'Unknown date string: {date_value}' - raise RuntimeError(msg) - - return value From 5c1eaecb9c9802123639d422911b768a694cb566 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Tue, 25 Mar 2025 10:33:16 -0400 Subject: [PATCH 05/10] fix schema autodetect --- pygeometa/core.py | 15 ++++++++++++--- tests/run_tests.py | 6 +++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/pygeometa/core.py b/pygeometa/core.py index 323c80b..57d840d 100644 --- a/pygeometa/core.py +++ b/pygeometa/core.py @@ -339,6 +339,9 @@ def import_metadata(schema: str, metadata: str) -> dict: :returns: MCF object """ + content = None + error_message = None + if schema == 'autodetect': schemas = get_supported_schemas() else: @@ -349,11 +352,17 @@ def import_metadata(schema: str, metadata: str) -> dict: schema_object = load_schema(s) try: - return schema_object.import_(metadata) + content = schema_object.import_(metadata) + break except NotImplementedError: - raise RuntimeError(f'Import not supported for {s}') + error_message = f'Import not supported for {s}' except Exception as err: - raise RuntimeError(f'Import failed: {err}') + error_message = f'Import failed: {err}' + + if error_message is not None: + LOGGER.warning(error_message) + + return content def transform_metadata(input_schema: str, output_schema: str, diff --git a/tests/run_tests.py b/tests/run_tests.py index 3ceb6c0..4c89283 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -226,17 +226,17 @@ def test_get_supported_schemas(self): schemas = sorted(get_supported_schemas()) self.assertIsInstance(schemas, list, 'Expected list') - self.assertEqual(len(schemas), 11, + self.assertEqual(len(schemas), 10, 'Expected specific number of supported schemas') self.assertEqual(sorted(schemas), sorted(['cwl', 'dcat', 'iso19139', 'iso19139-2', - 'iso19139-hnap', 'oarec-record', 'schema-org', + 'iso19139-hnap', 'oarec-record', 'stac-item', 'wmo-cmp', 'wmo-wcmp2', 'wmo-wigos']), 'Expected exact list of supported schemas') schemas = get_supported_schemas(include_autodetect=True) - self.assertEqual(len(schemas), 12, + self.assertEqual(len(schemas), 11, 'Expected specific number of supported schemas') self.assertIn('autodetect', schemas, 'Expected autodetect in list') From d5804894285a95cce769996e26d1030d8a8bfc6c Mon Sep 17 00:00:00 2001 From: Paul van Genuchten Date: Fri, 16 May 2025 09:03:44 +0200 Subject: [PATCH 06/10] fix export to schema-org (#266) * fix export to schem-org * Update __init__.py * Update __init__.py * Update __init__.py --------- Co-authored-by: Tom Kralidis --- pygeometa/schemas/schema_org/__init__.py | 200 +++++++++++++---------- 1 file changed, 117 insertions(+), 83 deletions(-) diff --git a/pygeometa/schemas/schema_org/__init__.py b/pygeometa/schemas/schema_org/__init__.py index 1189c6b..94d0094 100644 --- a/pygeometa/schemas/schema_org/__init__.py +++ b/pygeometa/schemas/schema_org/__init__.py @@ -56,6 +56,21 @@ LOGGER = logging.getLogger(__name__) +CONTACTS = [ + 'accountablePerson', + 'author', + 'contributor', + 'copyrightHolder', + 'creator', + 'editor', + 'funder', + 'maintainer', + 'producer', + 'provider', + 'publisher', + 'sponsor' +] + TYPES = { 'Series': 'series', 'SoftwareApplication': 'software', @@ -216,15 +231,16 @@ def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]: LOGGER.debug('Generating baseline record') record = { 'identifier': mcf['metadata']['identifier'], - '@type': dict(zip(TYPES.values(), TYPES.keys()))[mcf['metadata']['hierarchylevel']], # noqa + "@context": "http://schema.org/", + '@type': 'schema:' + dict(zip(TYPES.values(), TYPES.keys()))[mcf['metadata']['hierarchylevel']], # noqa 'spatialCoverage': [{ - '@type': 'Place', + '@type': 'schema:Place', 'geo': { - '@type': 'GeoShape', + '@type': 'schema:GeoShape', 'box': f'{miny},{minx} {maxy},{maxx}' } }], - 'title': title[0], + 'name': title[0], 'description': description[0], 'distribution': [] } @@ -234,8 +250,8 @@ def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]: LOGGER.debug('Checking for temporal') try: - begin = mcf['identification']['extents']['temporal'][0]['begin'] - end = mcf['identification']['extents']['temporal'][0].get('end') + begin = mcf['identification']['extents']['temporal'][0].get('begin') # noqa + end = mcf['identification']['extents']['temporal'][0].get('end') # noqa if begin in ['now', 'None', None]: begin = '..' @@ -267,8 +283,11 @@ def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]: record['datePublished'] = generate_datetime(value) LOGGER.debug('Checking for contacts') - record['contacts'] = self.generate_contacts( - mcf['contact']) + + for ct in CONTACTS: + contacts = self.generate_contacts(mcf['contact'], ct) + if contacts and len(contacts) > 0: + record[ct] = contacts all_keywords = [] @@ -306,13 +325,7 @@ def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]: if 'url' in license: LOGGER.debug('Encoding license as link') - license_link = { - 'rel': 'license', - 'type': 'text/html', - 'title': license.get('name', 'license for this resource'), - 'url': license['url'] - } - record['distribution'].append(self.generate_link(license_link)) + record['license'] = license['url'] else: LOGGER.debug('Encoding license as property') record['license'] = license['name'] @@ -321,20 +334,28 @@ def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]: for value in mcf['distribution'].values(): record['distribution'].append(self.generate_link(value)) + LOGGER.debug('Checking for content_info') + if mcf.get('content_info', {}): + ci = mcf['content_info'] + if ci.get('attributes', {}): + record['variableMeasured'] = self.generate_variables(ci['attributes']) # noqa + if ci.get('dimensions', {}): + record['variableMeasured'] = self.generate_variables(ci['dimensions']) # noqa + if stringify: return json_dumps(record) return record def generate_party(self, contact: dict, - lang1: str, lang2: str, roles: list) -> dict: + lang1: str, lang2: str) -> dict: """ generate party construct from MCF contact :param contact: dict of MCF contact :param self.lang1: primary language :param self.lang2: alternate language - :param roles: roles of contact + :returns: MCF contact as a party representation """ @@ -348,12 +369,6 @@ def generate_party(self, contact: dict, position_name = get_charstring(contact.get('positionname'), self.lang1, self.lang2) - hours_of_service = get_charstring(contact.get('hoursofservice'), - self.lang1, self.lang2) - - contact_instructions = get_charstring( - contact.get('contactinstructions'), self.lang1, self.lang2) - address = get_charstring(contact.get('address'), self.lang1, self.lang2) @@ -373,91 +388,107 @@ def generate_party(self, contact: dict, 'roles': [] } - if organization_name[0] is not None: - rp['organization'] = organization_name[0] if individual_name[0] is not None: + rp['@type'] = "schema:Person" rp['name'] = individual_name[0] - if position_name[0] is not None: - rp['position'] = position_name[0] - if hours_of_service[0] is not None: - rp['hoursOfService'] = hours_of_service[0] - if contact_instructions[0] is not None: - rp['contactInstructions'] = contact_instructions[0] + if position_name[0] is not None: + rp['jobTitle'] = position_name[0] + rp['affiliation'] = { + '@type': "schema:Organization", + 'name': organization_name[0] + } + else: + rp['@type'] = "schema:Organization" + rp['name'] = organization_name[0] if address[0] is not None: - rp['addresses'][0]['deliveryPoint'] = [address[0]] - if city[0] is not None: - rp['addresses'][0]['city'] = city[0] - if administrative_area[0] is not None: - rp['addresses'][0]['administrativeArea'] = administrative_area[0] - if postalcode[0] is not None: - rp['addresses'][0]['postalCode'] = postalcode[0] - if country[0] is not None: - rp['addresses'][0]['country'] = country[0] + rp['address'] = {"@type": "schema:PostalAddress"} + rp['address']['streetAddress'] = address[0] + if city[0] is not None: + rp['address']['addressLocality'] = city[0] + if administrative_area[0] is not None: + rp['address']['addressRegion'] = administrative_area[0] + if postalcode[0] is not None: + rp['address']['postalCode'] = postalcode[0] + if country[0] is not None: + rp['address']['addressCountry'] = country[0] if contact.get('phone') is not None: LOGGER.debug('Formatting phone number') phone = contact['phone'] phone = phone.replace('-', '').replace('(', '').replace(')', '') phone = phone.replace('+0', '+').replace(' ', '') - - rp['phones'] = [{'value': phone}] + rp['telephone'] = phone if contact.get('email') is not None: - rp['emails'] = [{'value': contact.get('email')}] - - if rp['addresses'][0] == {}: - rp.pop('addresses') - - for r in set(roles): - rp['roles'].append(r) + rp['email'] = contact.get('email') if 'url' in contact: - rp['distribution'] = [{ - 'rel': 'canonical', - 'type': 'text/html', - 'href': contact['url'] - }] + rp['url'] = contact['url'] return rp - def generate_contacts(self, contact: dict) -> list: + def generate_variables(self, dict_: dict) -> list: + """ + Generates 1..n variables + + :param dict_: `dict` of attributes + + :returns: `list` of variables + """ + + dict2 = [] + for d in dict_: + d2 = { + '@type': 'schema:PropertyValue', + 'name': d.get('name', ''), + 'decription': d.get('description', ''), + } + if d.get('max') is not None: + d2['maxValue'] = d['max'] + if d.get('min') is not None: + d2['minValue'] = d['min'] + if d.get('units') is not None: + d2['unitCode'] = d['unit'] + dict2.append(d2) + + return dict2 + + def generate_contacts(self, contact: dict, role: str) -> list: """ Generates 1..n contacts, streamlining identical contacts with multiple roles :param contact: `dict` of contacts + :param role: `str` of role :returns: `list` of contacts """ contacts = [] - contacts2 = [] - for key, value in contact.items(): - if contacts: - for c in contacts: - if value == c['contact']: - LOGGER.debug('Found matching contact; adding role') - c['roles'].append(key) - else: - LOGGER.debug('Adding contact') - contacts.append({ - 'contact': value, - 'roles': [key] - }) - else: - contacts.append({ - 'contact': value, - 'roles': [key] - }) + role_mcf_schema_map = { + 'accountablePerson': [], + 'author': ['originator'], + 'contributor': ['user'], + 'copyrightHolder': ['owner'], + 'creator': [], + 'editor': [], + 'funder': [], + 'maintainer': ['processor', 'custodian'], + 'producer': ['distributor', 'principalInvestigator'], + 'provider': ['resourceProvider'], + 'publisher': ['pointOfContact'], + 'sponsor': [] + } - LOGGER.debug(f'Contacts: {contacts}') - for c in contacts: - contacts2.append(self.generate_party(c['contact'], self.lang1, - self.lang2, c['roles'])) + for key, value in contact.items(): + if any([value.get('role', key) == role, + value.get('role', key) in role_mcf_schema_map[role]]): + contacts.append( + self.generate_party(value, self.lang1, self.lang2)) - return contacts2 + return contacts def generate_link(self, distribution: dict) -> dict: """ @@ -471,20 +502,23 @@ def generate_link(self, distribution: dict) -> dict: name = get_charstring(distribution.get('name'), self.lang1, self.lang2) + desc = get_charstring(distribution.get('description'), + self.lang1, self.lang2) + link = { + '@type': 'schema:DataDownload', 'contentUrl': distribution['url'] } if distribution.get('type') is not None: - link['type'] = distribution['type'] - - reltype = distribution.get('rel') or distribution.get('function') - if reltype is not None: - link['rel'] = reltype + link['encodingFormat'] = distribution['type'] if name != [None, None]: link['name'] = name[0] elif name != [None, None]: link['name'] = name[0] + if desc != [None, None]: + link['description'] = desc[0] + return link From ae62c6c802efac6f2892b96ae5dcd6d3855296e9 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Fri, 10 Oct 2025 23:55:30 -0400 Subject: [PATCH 07/10] fix ref --- pygeometa/schemas/schema_org/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pygeometa/schemas/schema_org/__init__.py b/pygeometa/schemas/schema_org/__init__.py index 94d0094..f06962c 100644 --- a/pygeometa/schemas/schema_org/__init__.py +++ b/pygeometa/schemas/schema_org/__init__.py @@ -237,7 +237,7 @@ def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]: '@type': 'schema:Place', 'geo': { '@type': 'schema:GeoShape', - 'box': f'{miny},{minx} {maxy},{maxx}' + 'box': f'{miny} {minx} {maxy} {maxx}' } }], 'name': title[0], @@ -384,7 +384,6 @@ def generate_party(self, contact: dict, self.lang1, self.lang2) rp = { - 'addresses': [{}], 'roles': [] } @@ -449,7 +448,7 @@ def generate_variables(self, dict_: dict) -> list: if d.get('min') is not None: d2['minValue'] = d['min'] if d.get('units') is not None: - d2['unitCode'] = d['unit'] + d2['unitCode'] = d['units'] dict2.append(d2) return dict2 From 55be74428725f20d79947ab3172f03de12bd33c5 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Fri, 10 Oct 2025 23:59:30 -0400 Subject: [PATCH 08/10] fix tests --- tests/run_tests.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/run_tests.py b/tests/run_tests.py index 4c89283..3ceb6c0 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -226,17 +226,17 @@ def test_get_supported_schemas(self): schemas = sorted(get_supported_schemas()) self.assertIsInstance(schemas, list, 'Expected list') - self.assertEqual(len(schemas), 10, + self.assertEqual(len(schemas), 11, 'Expected specific number of supported schemas') self.assertEqual(sorted(schemas), sorted(['cwl', 'dcat', 'iso19139', 'iso19139-2', - 'iso19139-hnap', 'oarec-record', + 'iso19139-hnap', 'oarec-record', 'schema-org', 'stac-item', 'wmo-cmp', 'wmo-wcmp2', 'wmo-wigos']), 'Expected exact list of supported schemas') schemas = get_supported_schemas(include_autodetect=True) - self.assertEqual(len(schemas), 11, + self.assertEqual(len(schemas), 12, 'Expected specific number of supported schemas') self.assertIn('autodetect', schemas, 'Expected autodetect in list') From 8becb0d661a41a6e5d5a6ac0fb13807515543dcf Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Sat, 11 Oct 2025 01:35:40 -0400 Subject: [PATCH 09/10] fix import/parsing --- pygeometa/schemas/schema_org/__init__.py | 45 ++++++++++++++---------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/pygeometa/schemas/schema_org/__init__.py b/pygeometa/schemas/schema_org/__init__.py index f06962c..8df5821 100644 --- a/pygeometa/schemas/schema_org/__init__.py +++ b/pygeometa/schemas/schema_org/__init__.py @@ -146,7 +146,7 @@ def import_(self, metadata: str) -> dict: }) if 'temporalCoverage' in md: - begin, end = md['temporalCoverage'].split('/') + begin, end = md['temporalCoverage'][0].split('/') mcf['identification']['extents']['temporal'] = [{ 'begin': begin, 'end': end @@ -157,14 +157,14 @@ def import_(self, metadata: str) -> dict: mcf['identification']['abstract'] = md['description'] if 'dateCreated' in md: - mcf['metadata']['identification']['creation'] = md['datePublished'] + mcf['identification']['creation'] = md['datePublished'] if 'datePublished' in md: - mcf['metadata']['identification']['publication'] = md['datePublished'] # noqa + mcf['identification']['publication'] = md['datePublished'] # noqa if 'dateModified' in md: - mcf['metadata']['identification']['revision'] = md['dateModified'] + mcf['identification']['revision'] = md['dateModified'] if 'version' in md: - mcf['metadata']['identification']['edition'] = md['version'] + mcf['identification']['edition'] = md['version'] mcf['identification']['keywords'] = { 'default': { @@ -184,20 +184,27 @@ def import_(self, metadata: str) -> dict: for ct in ['author', 'publisher', 'creator', 'provider', 'funder']: if ct in md: contact = {} - contact['url'] = md[ct]['url'] - contact['individualname'] = md[ct]['name'] - if md[ct]['@type'] == 'Organization': - contact['organization'] = md[ct]['name'] - - if 'address' in md[ct]: - contact['address'] = md[ct]['streetAddress'] - contact['city'] = md[ct]['addressLocality'] - contact['administrativearea'] = md[ct]['addressRegion'] - contact['postalcode'] = md[ct]['postalCode'] - contact['country'] = md[ct]['addressCountry'] - - if 'contactPoint' in md[ct]: - cp = md[ct][0] + + if isinstance(md[ct], list): + ct2 = md[ct][0] + else: + ct2 = md[ct] + + if 'url' in ct2: + contact['url'] = ct2['url'] + contact['individualname'] = ct2['name'] + if ct2['@type'] == 'Organization': + contact['organization'] = ct2['name'] + + if 'address' in ct2: + contact['address'] = ct2['streetAddress'] + contact['city'] = ct2['addressLocality'] + contact['administrativearea'] = ct2['addressRegion'] + contact['postalcode'] = ct2['postalCode'] + contact['country'] = ct2['addressCountry'] + + if 'contactPoint' in ct2: + cp = ct2[0] contact['email'] = cp['email'] contact['fax'] = cp['fax'] From d8e28e4d78091a790c9835effa5849f8e085d2be Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Sat, 11 Oct 2025 02:13:07 -0400 Subject: [PATCH 10/10] update --- pygeometa/schemas/schema_org/__init__.py | 71 ++++++++++++++++-------- 1 file changed, 49 insertions(+), 22 deletions(-) diff --git a/pygeometa/schemas/schema_org/__init__.py b/pygeometa/schemas/schema_org/__init__.py index 8df5821..01efed7 100644 --- a/pygeometa/schemas/schema_org/__init__.py +++ b/pygeometa/schemas/schema_org/__init__.py @@ -111,6 +111,7 @@ def import_(self, metadata: str) -> dict: 'mcf': { 'version': '1.0', }, + 'spatial': {}, 'metadata': {}, 'identification': { 'extents': { @@ -121,14 +122,18 @@ def import_(self, metadata: str) -> dict: 'distribution': {} } - mcf['metadata']['identifier'] = md['identifier'] + id_ = md.get('identifier', md.get('@id')) + mcf['metadata']['identifier'] = id_ mcf['metadata']['charset'] = 'utf-8' mcf['metadata']['type'] = TYPES[md.get('type', 'Dataset')] mcf['metadata']['language'] = md.get('inLanguage', 'en') if 'spatialCoverage' in md or 'spatial' in md: + sc = _get_list_or_dict(md['spatialCoverage']) crs = 4326 - geo = md['spatialCoverage']['geo'] + + geo = _get_list_or_dict(sc['geo']) + if geo['@type'] == 'GeoCoordinates': mcf['spatial']['datatype'] = 'vector' mcf['spatial']['geomtype'] = 'point' @@ -185,30 +190,30 @@ def import_(self, metadata: str) -> dict: if ct in md: contact = {} - if isinstance(md[ct], list): - ct2 = md[ct][0] - else: - ct2 = md[ct] + ct2 = _get_list_or_dict(md[ct]) + + if ct2: + contact['individualname'] = ct2['name'] - if 'url' in ct2: - contact['url'] = ct2['url'] - contact['individualname'] = ct2['name'] - if ct2['@type'] == 'Organization': - contact['organization'] = ct2['name'] + if 'url' in ct2: + contact['url'] = ct2['url'] - if 'address' in ct2: - contact['address'] = ct2['streetAddress'] - contact['city'] = ct2['addressLocality'] - contact['administrativearea'] = ct2['addressRegion'] - contact['postalcode'] = ct2['postalCode'] - contact['country'] = ct2['addressCountry'] + if ct2['@type'] == 'Organization': + contact['organization'] = ct2['name'] - if 'contactPoint' in ct2: - cp = ct2[0] - contact['email'] = cp['email'] - contact['fax'] = cp['fax'] + if 'address' in ct2: + contact['address'] = ct2['streetAddress'] + contact['city'] = ct2['addressLocality'] + contact['administrativearea'] = ct2['addressRegion'] + contact['postalcode'] = ct2['postalCode'] + contact['country'] = ct2['addressCountry'] - mcf['contact'][ct] = contact + if 'contactPoint' in ct2: + cp = _get_list_or_dict(ct2['contactPoint']) + contact['email'] = cp.get('email') + contact['fax'] = cp.get('fax') + + mcf['contact'][ct] = contact return mcf @@ -528,3 +533,25 @@ def generate_link(self, distribution: dict) -> dict: link['description'] = desc[0] return link + + +def _get_list_or_dict(value: Union[None, list, dict]) -> Union[None, dict]: + """ + Helper function to determine whether an element is a list, object or `None` + + :param value: value to evaluate + + :returns: `dict` or None + """ + + if value is None: + return None + + if isinstance(value, list): + if len(value) == 0: + return None + else: + return value[0] + + else: + return value