Skip to content

Commit cca2e34

Browse files
committed
initial commit on import dcat records
draft - test still failing; introduces an rdflib dependency, which could also be imported conditionally
1 parent a16e8e9 commit cca2e34

File tree

5 files changed

+925
-3
lines changed

5 files changed

+925
-3
lines changed

pygeometa/schemas/dcat/__init__.py

Lines changed: 263 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,86 @@
4343
#
4444
# =================================================================
4545

46-
import os
47-
from typing import Union
46+
import os, sys, yaml, json
47+
from typing import Dict, Any, List, Optional, Union
48+
from rdflib import Graph, URIRef, Namespace, Literal
49+
from rdflib.namespace import RDF
4850

4951
from pygeometa.helpers import json_dumps
5052
from pygeometa.schemas.base import BaseOutputSchema
5153

54+
55+
# Namespaces used to expand prefixed names (e.g. 'dct:title') into full IRIs.
# Each base IRI must end in its separator ('#' or '/') because rdflib builds
# a term by plain concatenation of the base and the local name.
DCT = Namespace('http://purl.org/dc/terms/')
DCAT = Namespace('http://www.w3.org/ns/dcat#')
SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')
PROV = Namespace('http://www.w3.org/ns/prov#')
FOAF = Namespace('http://xmlns.com/foaf/0.1/')
ADMS = Namespace('http://www.w3.org/ns/adms#')
LOCN = Namespace('http://www.w3.org/ns/locn#')
VCARD = Namespace('http://www.w3.org/2006/vcard/ns#')
OWL = Namespace('http://www.w3.org/2002/07/owl#')
# NOTE(review): this rebinds the RDF name imported from rdflib.namespace at
# the top of the file; RDF.type resolves to the same IRI either way.
RDF = Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
RDFS = Namespace('http://www.w3.org/2000/01/rdf-schema#')
SCHEMA = Namespace('http://schema.org/')
# OWL-Time terms live under '...#'; without the trailing '#' every term
# would be glued directly onto 'time' (e.g. '.../2006/timeInstant').
TIME = Namespace('http://www.w3.org/2006/time#')
XSD = Namespace('http://www.w3.org/2001/XMLSchema#')
71+
72+
# Default DCAT-AP 3.0.1 context URL, injected into JSON-LD documents that lack an '@context'
73+
_DCAT_AP_CONTEXT_URL = "https://github.com/SEMICeu/DCAT-AP/raw/refs/heads/master/releases/3.0.1/context/dcat-ap.jsonld"
74+
75+
# Default mapping from DCAT/DCT properties (as qnames) to dot-separated MCF
# paths. Several properties intentionally share one MCF target
# (e.g. dct:description / dct:abstract, dct:subject / dcat:keyword /
# dcat:theme); how such collisions are resolved is up to the consumer of
# this mapping.
DEFAULT_MAPPING = {
    'dct:title': 'identification.title',
    'dct:description': 'identification.abstract',
    'dct:abstract': 'identification.abstract',
    'dct:subject': 'identification.subjects',
    'dct:temporal': 'identification.temporal',
    'dct:spatial': 'identification.geographic',
    'dct:license': 'identification.licence',
    'dcat:keyword': 'identification.subjects',
    'dct:language': 'metadata.language',
    'dct:modified': 'identification.modified',
    'dct:source': 'identification.source',
    'dct:accessRights': 'identification.rights',
    'dct:conformsTo': 'identification.conformsto',
    'dcat:contactPoint': 'identification.contactpoint',
    # the DCAT vocabulary spells this term 'endpointURL'; IRIs are
    # case-sensitive, so 'endpointUrl' would never match any triple
    'dcat:endpointURL': 'identification.endpointurl',
    'dct:format': 'identification.format',
    'dcat:landingPage': 'identification.landingpage',
    'dct:publisher': 'identification.publisher',
    'dct:creator': 'identification.creator',
    'dcat:distribution': 'identification.distribution',
    'dct:accrualPeriodicity': 'identification.accrualPeriodicity',
    'dcat:hasVersion': 'identification.hasVersion',
    'dct:identifier': 'metadata.identifier',
    'dcat:inSeries': 'identification.inSeries',
    'dct:isReferencedBy': 'identification.isReferencedBy',
    'dct:provenance': 'identification.provenance',
    'dct:relation': 'identification.relation',
    'dct:issued': 'identification.issued',
    'adms:sample': 'identification.sample',
    'dcat:spatialResolutionInMeters': 'identification.spatialResolutionInMeters',
    'dcat:temporalResolution': 'identification.temporalResolution',
    'dcat:theme': 'identification.subjects',
    'dct:type': 'metadata.hierarchylevel',
    'adms:versionNotes': 'identification.versionnotes',
    'prov:wasGeneratedBy': 'identification.wasgeneratedby',
}
113+
114+
INTL_MCF_FIELDS = ["identification.abstract","identification.title"]
115+
116+
# Parser formats to try in order
117+
_PARSER_FORMATS = [
118+
'json-ld',
119+
'xml',
120+
'turtle',
121+
'n3',
122+
'trig',
123+
]
124+
125+
52126
THISDIR = os.path.dirname(os.path.realpath(__file__))
53127

54128

@@ -65,6 +139,193 @@ def __init__(self):
65139
description = 'DCAT'
66140
super().__init__('dcat', description, 'json', THISDIR)
67141

142+
143+
def _inject_jsonld_context(self, content: str) -> str:
144+
"""
145+
Inject DCAT-AP context into JSON content if missing.
146+
147+
148+
Returns modified JSON string if '@context' not found.
149+
If parsing fails, returns original content.
150+
"""
151+
try:
152+
data = json.loads(content)
153+
if isinstance(data, dict) and '@context' not in data:
154+
data['@context'] = _DCAT_AP_CONTEXT_URL
155+
return json.dumps(data)
156+
except Exception:
157+
pass
158+
return content
159+
160+
def parse_dcat_content(self, content: str, base: Optional[str] = None) -> Graph:
    """
    Parse content into an rdflib.Graph, trying a set of common RDF
    serialisations in turn

    :param content: string of RDF content
    :param base: optional base URI, handed to rdflib as `publicID`

    :returns: `rdflib.Graph` from the first format in `_PARSER_FORMATS`
              that parses without raising
    :raises ValueError: if none of the attempted formats succeed; the last
                        parser error is attached as the exception cause
    """

    # Content that looks like JSON may be JSON-LD published without an
    # '@context'; give it the default one before handing it to rdflib.
    # Non-string input skips this step and goes straight to the parsers.
    if isinstance(content, str) and content.lstrip().startswith(('{', '[')):
        content = self._inject_jsonld_context(content)

    last_exc = None
    for fmt in _PARSER_FORMATS:
        # a new graph per attempt, so nothing a failed parse may have
        # added is carried into the next attempt
        graph = Graph()
        try:
            graph.parse(data=content, format=fmt, publicID=base)
        except Exception as exc:
            # each parser plugin raises its own exception types, so a
            # broad catch is deliberate here; the error is kept for the
            # final report rather than discarded
            last_exc = exc
        else:
            return graph

    raise ValueError(
        f"Unable to parse content as a known RDF serialisation. Last error: {last_exc}"
    ) from last_exc
184+
185+
186+
def _to_uriref(self, key: str) -> URIRef:
    """
    Convert a mapping key into a URIRef

    :param key: full http(s) URI, or a qname such as 'dct:title' or
                'dcat:keyword'

    :returns: `rdflib.URIRef` for the key. Keys with an unknown prefix, or
              with no prefix at all, are resolved as-is against the DCT
              namespace.
    """

    if key.startswith(('http://', 'https://')):
        return URIRef(key)

    prefix, sep, local = key.partition(':')
    if sep:
        # every namespace declared at module level is resolvable here
        known_prefixes = {
            'adms': ADMS,
            'dcat': DCAT,
            'dct': DCT,
            'foaf': FOAF,
            'locn': LOCN,
            'owl': OWL,
            'prov': PROV,
            'rdf': RDF,
            'rdfs': RDFS,
            'schema': SCHEMA,
            'skos': SKOS,
            'time': TIME,
            'vcard': VCARD,
            'xsd': XSD,
        }
        ns = known_prefixes.get(prefix)
        if ns is not None:
            return ns[local]

    # unknown or missing prefix: the whole key becomes the DCT local name
    return DCT[key]
210+
211+
212+
def _collect_literals_by_lang(self, values: List[Any], deflang='eng') -> Dict[str, List[str]]:
213+
"""
214+
Given a list of rdflib nodes, return dict lang -> list(strings).
215+
216+
Literals without language tags go under the empty-string key ''.
217+
Non-literals (URIRefs) are converted to their string representation
218+
and put under ''.
219+
"""
220+
out: Dict[str, List[str]] = {}
221+
for v in values:
222+
if isinstance(v, Literal):
223+
s = str(v)
224+
lang = v.language or deflang
225+
else:
226+
s = str(v)
227+
lang = deflang
228+
if s not in (None,''):
229+
out.setdefault(lang, []).append(s)
230+
return out
231+
232+
233+
def _join_lang_values(self, values: List[str]) -> str:
234+
"""Join multiple values for the same language into a single scalar.
235+
236+
The join token is ' | '. This keeps values readable while producing a
237+
single scalar as required by the MCF core schema.
238+
"""
239+
if not values:
240+
return ''
241+
if len(values) == 1:
242+
return values[0]
243+
return ' | '.join(values)
244+
245+
246+
def build_mcf_dict(self, g: Graph, mapping: Dict[str, str], dataset_uri: Optional[str] = None) -> Dict[str, Any]:
    """
    Build an MCF-compatible nested dict from the provided graph

    :param g: rdflib.Graph containing DCAT metadata
    :param mapping: dict mapping source DCAT/DCT property (qname or URI) to
                    a dot-separated MCF path (e.g. 'identification.title')
    :param dataset_uri: optional URI to focus extraction on a single
                        dataset; when omitted, the first dcat:Dataset
                        found is used, falling back to the first subject
                        in the graph

    :returns: nested `dict` suitable for YAML serialization according to
              pygeometa's MCF
    :raises ValueError: if no dataset node can be found (empty graph)
    """

    if dataset_uri:
        dataset_node = URIRef(dataset_uri)
    else:
        dataset_node = next(iter(g.subjects(RDF.type, DCAT['Dataset'])), None)
        if dataset_node is None:
            # no typed dataset: fall back to the first subject in the graph
            dataset_node = next(iter(g.subjects()), None)

    if dataset_node is None:
        raise ValueError('No dataset node found in the provided graph')

    mcf: Dict[str, Any] = {}

    for src_prop, tgt_path in mapping.items():
        prop_ref = self._to_uriref(src_prop)
        values = list(g.objects(subject=dataset_node, predicate=prop_ref))
        if not values:
            continue

        # walk/create the nested dicts leading to the target key
        *parents, final_key = tgt_path.split('.')
        cur = mcf
        for part in parents:
            cur = cur.setdefault(part, {})

        if tgt_path not in INTL_MCF_FIELDS:
            # plain scalar field: only the first value is kept, and a
            # later mapping entry with the same target overwrites it
            cur[final_key] = self._node_to_scalar(g, values[0])
            continue

        # multilingual field: one joined scalar per language
        lang_map: Dict[str, str] = {
            (lang or 'eng'): self._join_lang_values(vals)
            for lang, vals in self._collect_literals_by_lang(values).items()
        }

        existing = cur.get(final_key)
        if existing is None:
            cur[final_key] = lang_map
            continue

        # another source property already filled this field: keep its
        # languages and append/add the new text per language
        for lang, val in lang_map.items():
            if existing.get(lang):
                existing[lang] = existing[lang] + ' | ' + val
            else:
                existing[lang] = val

    return mcf

def _node_to_scalar(self, g: Graph, node: Any) -> str:
    """
    Render a single rdflib node as a plain string

    URIRefs are compacted to a qname via the graph's namespace manager;
    when no qname can be computed the full IRI is used instead. Any other
    node is returned as its string form.

    :param g: rdflib.Graph whose namespace bindings are used for qnames
    :param node: rdflib node to render

    :returns: string form of the node
    """

    if not isinstance(node, URIRef):
        return str(node)
    try:
        return str(g.qname(node))
    except Exception:
        # rdflib raises when an IRI cannot be split into namespace and
        # local name (e.g. a licence URL ending in '/').
        # NOTE(review): the exception type has varied across rdflib
        # releases, hence the broad catch. Falling back to the full IRI
        # keeps one such value from aborting the whole import.
        return str(node)
312+
313+
314+
def import_(self, metadata: str) -> dict:
    """
    Import metadata into MCF

    :param metadata: string of metadata content, in any RDF
                     serialisation accepted by parse_dcat_content()

    :returns: `dict` of MCF content
    """

    # parse first, then map the resulting graph with the default mapping;
    # no dataset URI is given, so build_mcf_dict() picks the dataset node
    return self.build_mcf_dict(
        self.parse_dcat_content(metadata),
        DEFAULT_MAPPING,
        dataset_uri=None,
    )
327+
328+
68329
def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]:
69330
"""
70331
Write MCF to DCAT

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ jsonschema
44
lxml
55
OWSLib
66
pyyaml
7+
rdflib

0 commit comments

Comments
 (0)