rdfizer.py

rdfizer.py
¶	#!/usr/bin/env python
# Snac Hacks
# Ed Summers
# Modified by Brian Tingle for the SNAC Project

"""
This experimental script will convert the EAC graphml dump to rdf/xml
using rdflib (which you will need to have installed).

    % rdfizer.py graph-snac-example.xml

Which should generate:

    eac.rdf

It uses the FOAF [1] and Arch [2] vocabularies.

[1] http://xmlns.com/foaf/spec/
[2] http://gslis.simmons.edu/archival/arch
"""

import sys
from xml.sax import parse
from xml.sax.handler import ContentHandler

import rdflib

FOAF = rdflib.Namespace("http://xmlns.com/foaf/0.1/")
ARCH = rdflib.Namespace("http://purl.org/archival/vocab/arch#")
OWL = rdflib.Namespace("http://www.w3.org/2002/07/owl#")


def main(graphml_file):
    """Convert the GraphML file *graphml_file* to RDF/XML in ``eac.rdf``.

    The triples are accumulated in an on-disk rdflib store named ``store``
    (opened with ``create=True``), populated by running the SAX parser over
    *graphml_file* with a ``GraphMLHandler``, and then serialised as
    RDF/XML to ``eac.rdf`` in the current directory.

    The store is closed even when parsing or serialisation fails, and the
    output file is closed as soon as serialisation finishes.
    """
    # create rdflib berkeleydb graph to populate
    graph = rdflib.Graph("Sleepycat")
    graph.open("store", create=True)
    try:
        # parse the graphml into the graph
        handler = GraphMLHandler(graph)
        parse(graphml_file, handler)

        # output it as rdf/xml
        graph.bind("arch", ARCH)
        graph.bind("foaf", FOAF)

        # other serializer formats: 'n3', 'turtle', 'nt', 'pretty-xml', 'trix'
        # Binary mode: the serializer emits encoded bytes, and `open` (unlike
        # the Python-2-only `file` builtin) exists on every Python version.
        with open("eac.rdf", "wb") as out:
            graph.serialize(out, format="xml")
    finally:
        # always release the on-disk store, even after a failure
        graph.close()
# handle the graphML XML
class GraphMLHandler(ContentHandler):
    """SAX content handler that turns GraphML nodes and edges into triples.

    While a ``node`` or ``edge`` element is open, its XML attributes are
    kept in a dict (``self.node`` / ``self.edge``). The text of each nested
    ``data`` element is accumulated into that dict under the element's
    ``key`` attribute. When the ``node`` or ``edge`` element closes, the
    collected values are written to ``self.graph`` as RDF triples.
    """

    def __init__(self, graph):
        """Remember *graph*, the rdflib graph that receives the triples."""
        # Explicit base call (not super()): ContentHandler is an old-style
        # class on Python 2.
        ContentHandler.__init__(self)
        self.node = False
        self.edge = False
        self.key = None
        self.graph = graph

    def startElement(self, name, attributes):
        """Start collecting a node/edge, or note which data key is open."""
        if name == 'node':
            self.node = dict(attributes)
        elif name == 'edge':
            self.edge = dict(attributes)
        elif name == 'data':
            self.key = attributes['key']

    def endElement(self, name):
        """Emit triples for a finished node/edge, or close the data key."""
        if name == 'node':
            self._add_node(self.node)
            print(self.node['identity'])
            self.node = None
        elif name == 'edge':
            self._add_edge(self.edge)
            # reset like self.node so stray text can never land in a
            # finished edge
            self.edge = None
        elif name == 'data':
            self.key = None

    def characters(self, content):
        """Append *content* to the open data key of the current node/edge."""
        if not self.key:
            return
        elif self.node:
            self.node[self.key] = self.node.get(self.key, "") + content
        elif self.edge:
            self.edge[self.key] = self.edge.get(self.key, "") + content

    @staticmethod
    def _collection_uris(raw):
        """Return ``<url>#Collection`` for each whitespace-separated url.

        Splits on any run of whitespace (spaces, newlines, tabs), so blank
        entries never occur.
        """
        # httpRange-14 needs #Collection
        return [url + "#Collection" for url in raw.split()]

    def _add_node(self, n):
        """Add the triples describing one node (an archival entity)."""
        s = snac_url(n['filename'])
        self.graph.add((s, FOAF.name, rdflib.Literal(n['identity'])))

        entity_type = n['entityType']
        if entity_type == 'person':
            # TODO: massage heading into a real name? please don't, we put
            # so much work putting them in inverted order :)
            self.graph.add((s, rdflib.RDF.type, FOAF.Person))
            # don't trust VIAF/dbpedia links for names that are only one
            # word long (no `" " in`) because they are generally dubious
            if " " in n['identity']:
                for source in ("viaf", "dbpedia"):
                    if source in n:
                        # the object must be an rdflib term, not a bare str
                        same = rdflib.URIRef(n[source].strip())
                        self.graph.add((s, OWL.sameAs, same))
        elif entity_type == 'family':
            self.graph.add((s, rdflib.RDF.type, ARCH.Family))
        elif entity_type == 'corporateBody':
            self.graph.add((s, rdflib.RDF.type, FOAF.Organization))

        # links to collections this person created
        if "creatorOf" in n:
            for uri in self._collection_uris(n['creatorOf']):
                coll = rdflib.URIRef(uri)
                self.graph.add((coll, rdflib.RDF.type, ARCH.Collection))
                self.graph.add((coll, ARCH.hasProvenance, s))
                self.graph.add((s, ARCH.primaryProvenanceOf, coll))

        # links to collections this person is mentioned in
        if "associatedWith" in n:
            for uri in self._collection_uris(n['associatedWith']):
                coll = rdflib.URIRef(uri)
                self.graph.add((coll, rdflib.RDF.type, ARCH.Collection))
                # TODO: does this need a reciprocal relationship, i.e.
                # (coll, ARCH.hasProvenance, s)?
                self.graph.add((s, ARCH.referencedIn, coll))

    def _add_edge(self, e):
        """Add the symmetric relationship triples for one edge."""
        s = snac_url(e['from_file'])
        o = snac_url(e['to_file'])
        print("%s -> %s" % (s, o))

        # TODO: make sure these exist?; pretty sure it will
        label = e['label']
        if label == 'correspondedWith':
            # make it symmetrical without having to do inferencing
            self.graph.add((s, ARCH['correspondedWith'], o))
            self.graph.add((o, ARCH['correspondedWith'], s))
        elif label in ['associatedWith', 'associateWith']:
            # done: is this an ok interpretation? yes, checked with
            # @rubinsztajn
            self.graph.add((s, ARCH['appearsWith'], o))
            self.graph.add((o, ARCH['appearsWith'], s))
# address the httpRange-14 issue with `#entity`
def snac_url(name):
    """Return the rdflib URIRef identifying the SNAC entity for *name*.

    *name* is interpolated, unescaped, into the ``docId`` query parameter
    of the XTF view URL, and the ``#entity`` fragment is appended.
    """
    return rdflib.URIRef(
        "http://socialarchive.iath.virginia.edu/xtf/view?docId=%s#entity" % name
    )


if __name__ == "__main__":
    # the GraphML file to convert is the first command-line argument
    main(sys.argv[1])