4343#
4444# =================================================================
4545
46- import os
47- from typing import Union
46+ import os , sys , yaml , json
47+ from typing import Dict , Any , List , Optional , Union
48+ from rdflib import Graph , URIRef , Namespace , Literal
49+ from rdflib .namespace import RDF
4850
4951from pygeometa .helpers import json_dumps
5052from pygeometa .schemas .base import BaseOutputSchema
5153
54+
# Namespaces
DCT = Namespace('http://purl.org/dc/terms/')
DCAT = Namespace('http://www.w3.org/ns/dcat#')
SKOS = Namespace('http://www.w3.org/2004/02/skos/core#')
PROV = Namespace('http://www.w3.org/ns/prov#')
FOAF = Namespace('http://xmlns.com/foaf/0.1/')
ADMS = Namespace('http://www.w3.org/ns/adms#')
LOCN = Namespace('http://www.w3.org/ns/locn#')
VCARD = Namespace('http://www.w3.org/2006/vcard/ns#')
OWL = Namespace('http://www.w3.org/2002/07/owl#')
# NOTE(review): this rebinds the RDF name imported from rdflib.namespace at
# the top of the module; kept so lookups such as RDF.type behave as before
RDF = Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
RDFS = Namespace('http://www.w3.org/2000/01/rdf-schema#')
SCHEMA = Namespace('http://schema.org/')
# the namespace IRI must end with '#': without it TIME['Instant'] would
# resolve to 'http://www.w3.org/2006/timeInstant'
TIME = Namespace('http://www.w3.org/2006/time#')
XSD = Namespace('http://www.w3.org/2001/XMLSchema#')
71+
# Default DCAT-AP context URL, injected into JSON content that does not
# declare its own '@context'
# NOTE(review): the URL pins release 3.0.1 - confirm this is the intended
# DCAT-AP release
_DCAT_AP_CONTEXT_URL = (
    'https://github.com/SEMICeu/DCAT-AP/raw/refs/heads/master/'
    'releases/3.0.1/context/dcat-ap.jsonld'
)
74+
# default mapping aligning common DCAT/DCT terms to MCF paths
# keys are qnames resolved against the namespaces declared in this module,
# values are dot-separated MCF paths; several keys may share one MCF path
DEFAULT_MAPPING = {
    'dct:title': 'identification.title',
    'dct:description': 'identification.abstract',
    'dct:abstract': 'identification.abstract',
    'dct:subject': 'identification.subjects',
    'dct:temporal': 'identification.temporal',
    'dct:spatial': 'identification.geographic',
    'dct:license': 'identification.licence',
    'dcat:keyword': 'identification.subjects',
    'dct:language': 'metadata.language',
    'dct:modified': 'identification.modified',
    'dct:source': 'identification.source',
    'dct:accessRights': 'identification.rights',
    'dct:conformsTo': 'identification.conformsto',
    'dcat:contactPoint': 'identification.contactpoint',
    # the DCAT vocabulary spells this term 'endpointURL'; the previous
    # 'endpointUrl' spelling is kept so existing lookups keep working
    'dcat:endpointURL': 'identification.endpointurl',
    'dcat:endpointUrl': 'identification.endpointurl',
    'dct:format': 'identification.format',
    'dcat:landingPage': 'identification.landingpage',
    'dct:publisher': 'identification.publisher',
    'dct:creator': 'identification.creator',
    'dcat:distribution': 'identification.distribution',
    'dct:accrualPeriodicity': 'identification.accrualPeriodicity',
    'dcat:hasVersion': 'identification.hasVersion',
    'dct:identifier': 'metadata.identifier',
    'dcat:inSeries': 'identification.inSeries',
    'dct:isReferencedBy': 'identification.isReferencedBy',
    'dct:provenance': 'identification.provenance',
    'dct:relation': 'identification.relation',
    'dct:issued': 'identification.issued',
    'adms:sample': 'identification.sample',
    'dcat:spatialResolutionInMeters': 'identification.spatialResolutionInMeters',
    'dcat:temporalResolution': 'identification.temporalResolution',
    'dcat:theme': 'identification.subjects',
    'dct:type': 'metadata.hierarchylevel',
    'adms:versionNotes': 'identification.versionnotes',
    'prov:wasGeneratedBy': 'identification.wasgeneratedby',
}
113+
# MCF paths whose values are kept as a language -> text mapping
INTL_MCF_FIELDS = [
    'identification.abstract',
    'identification.title',
]
115+
# RDF serialisations attempted when parsing content, in this order
_PARSER_FORMATS = ['json-ld', 'xml', 'turtle', 'n3', 'trig']
124+
125+
# directory holding this module, with symbolic links resolved
THISDIR = os.path.split(os.path.realpath(__file__))[0]
53127
54128
@@ -65,6 +139,193 @@ def __init__(self):
65139 description = 'DCAT'
66140 super ().__init__ ('dcat' , description , 'json' , THISDIR )
67141
142+
def _inject_jsonld_context(self, content: str) -> str:
    """
    Inject the default DCAT-AP context into JSON content if missing

    Only a top-level JSON object without an '@context' key is modified.
    Any other input (JSON arrays or scalars, objects that already declare
    '@context', or content that is not valid JSON) is returned unchanged.

    :param content: string of metadata content, possibly JSON-LD

    :returns: JSON string with '@context' added, or the original content
    """

    try:
        data = json.loads(content)
    except (TypeError, ValueError, RecursionError):
        # not decodable as JSON (json.JSONDecodeError is a ValueError):
        # hand the content back untouched rather than failing here
        return content

    if isinstance(data, dict) and '@context' not in data:
        data['@context'] = _DCAT_AP_CONTEXT_URL
        return json.dumps(data)

    return content
159+
def parse_dcat_content(self, content: str, base: Optional[str] = None) -> Graph:
    """
    Parse content into an rdflib.Graph, trying each serialisation listed
    in _PARSER_FORMATS in order

    :param content: string of RDF metadata content
    :param base: optional base URI, passed to the parser as publicID

    :returns: `rdflib.Graph` built by the first parser that succeeds

    :raises ValueError: if none of the attempted formats succeed
    """

    # content that looks like JSON may be JSON-LD lacking an '@context';
    # non-string input is passed to the parsers as-is
    if isinstance(content, str) and content.lstrip().startswith(('{', '[')):
        content = self._inject_jsonld_context(content)

    last_exc = None
    for fmt in _PARSER_FORMATS:
        # fresh graph per attempt so a failed parse cannot leave triples
        g = Graph()
        try:
            g.parse(data=content, format=fmt, publicID=base)
        except Exception as exc:
            # any failure only means this format did not match; remember
            # it for the final error message and try the next one
            last_exc = exc
        else:
            return g

    raise ValueError(
        'Unable to parse content as a known RDF serialisation. '
        f'Last error: {last_exc}'
    ) from last_exc
184+
185+
def _to_uriref(self, key: str) -> URIRef:
    """
    Resolve a mapping key to an rdflib URIRef

    :param key: absolute http(s) URI, or a qname such as 'dct:title'
                or 'dcat:keyword'

    :returns: `URIRef` for the key; a key without a recognised prefix is
              appended, unchanged, to the DCT namespace
    """

    if key.startswith(('http://', 'https://')):
        return URIRef(key)

    known_namespaces = {
        'dct': DCT,
        'dcat': DCAT,
        'skos': SKOS,
        'prov': PROV,
        'foaf': FOAF,
        'adms': ADMS,
        'locn': LOCN,
        'vcard': VCARD,
        'schema': SCHEMA,
    }

    prefix, colon, local_name = key.partition(':')
    namespace = known_namespaces.get(prefix) if colon else None
    if namespace is None:
        # no colon, or an unrecognised prefix
        return DCT[key]
    return namespace[local_name]
210+
211+
def _collect_literals_by_lang(self, values: List[Any], deflang='eng') -> Dict[str, List[str]]:
    """
    Group the string form of rdflib nodes by language

    Literals are filed under their language tag. Literals without a tag
    and non-literal nodes (e.g. URIRefs) are filed under ``deflang``.
    Nodes whose string form is empty are skipped.

    :param values: list of rdflib nodes
    :param deflang: language key used when a node carries no language tag

    :returns: `dict` of language -> list of strings
    """

    grouped: Dict[str, List[str]] = {}
    for node in values:
        text = str(node)
        if text == '':
            continue
        if isinstance(node, Literal) and node.language:
            language = node.language
        else:
            language = deflang
        grouped.setdefault(language, []).append(text)
    return grouped
231+
232+
def _join_lang_values(self, values: List[str]) -> str:
    """
    Collapse the values collected for one language into a single scalar

    :param values: list of strings for the same language

    :returns: '' for no values, the value itself when there is exactly
              one, otherwise all values joined with ' | '
    """

    count = len(values) if values else 0
    if count == 0:
        return ''
    return values[0] if count == 1 else ' | '.join(values)
244+
245+
def build_mcf_dict(self, g: Graph, mapping: Dict[str, str], dataset_uri: Optional[str] = None) -> Dict[str, Any]:
    """
    Build an MCF-compatible nested dict from the provided graph

    The node to describe is ``dataset_uri`` when given, otherwise the first
    subject typed dcat:Dataset, otherwise the first subject in the graph.

    :param g: rdflib.Graph containing DCAT metadata
    :param mapping: dict mapping source DCAT/DCT property (qname or URI) to
                    a dot-separated MCF path (e.g. 'identification.title')
    :param dataset_uri: optional URI to focus extraction on a single dataset

    :returns: nested `dict` keyed by the segments of the MCF paths

    :raises ValueError: if no dataset_uri is given and the graph is empty
    """

    def to_scalar(node: Any) -> str:
        """Render one node as text, abbreviating URIRefs to a qname"""
        if isinstance(node, URIRef):
            try:
                return str(g.qname(node))
            except (KeyError, ValueError):
                # rdflib raises when a URI cannot be split into namespace
                # and local name (e.g. one ending in '/'); keep the full
                # URI instead of aborting the whole import
                return str(node)
        return str(node)

    # Identify dataset node
    if dataset_uri:
        dataset_node = URIRef(dataset_uri)
    else:
        typed = g.triples((None, RDF.type, DCAT['Dataset']))
        dataset_node = next((s for s, _, _ in typed), None)
        if dataset_node is None:
            # fallback to first subject found in the graph
            anything = g.triples((None, None, None))
            dataset_node = next((s for s, _, _ in anything), None)

    if dataset_node is None:
        raise ValueError('No dataset node found in the provided graph')

    mcf: Dict[str, Any] = {}

    for src_prop, tgt_path in mapping.items():
        prop_ref = self._to_uriref(src_prop)
        values = list(g.objects(subject=dataset_node, predicate=prop_ref))
        if not values:
            continue

        # walk/create the nested dicts leading to the final key
        *parents, final_key = tgt_path.split('.')
        cur = mcf
        for part in parents:
            cur = cur.setdefault(part, {})

        if tgt_path not in INTL_MCF_FIELDS:
            # plain field: only the first object is kept, and a later
            # mapping entry with the same target overwrites it
            cur[final_key] = to_scalar(values[0])
            continue

        # internationalised field: one joined scalar per language
        lang_values: Dict[str, str] = {
            (lang or 'eng'): self._join_lang_values(vals)
            for lang, vals in self._collect_literals_by_lang(values).items()
        }

        existing = cur.get(final_key)
        if existing is None:
            cur[final_key] = lang_values
            continue

        # another source property already filled this field: append to
        # languages that have text, add the ones that do not
        for lang, val in lang_values.items():
            if existing.get(lang):
                existing[lang] = existing[lang] + ' | ' + val
            else:
                existing[lang] = val

    return mcf
312+
313+
def import_(self, metadata: str) -> dict:
    """
    Import metadata into MCF

    :param metadata: string of DCAT metadata content, in any serialisation
                     handled by parse_dcat_content

    :returns: `dict` of MCF content
    """

    # no dataset URI is supplied, so build_mcf_dict picks the node itself
    return self.build_mcf_dict(
        self.parse_dcat_content(metadata),
        DEFAULT_MAPPING,
        dataset_uri=None,
    )
327+
328+
68329 def write (self , mcf : dict , stringify : str = True ) -> Union [dict , str ]:
69330 """
70331 Write MCF to DCAT
0 commit comments