load_eac.grm | |
---|---|
groovy / gremlin script to load EAC-CPF relations into a graph database
| |
I could not figure out how to pass command-line parameters to a gremlin
script, so, as gross as this is, we have to read the settings from environment variables
| def env = System.getenv() |
directory to troll | def data_root = env['EAC_DIR'] ?: "./data" |
XTF Base URL used in inner loop to look up authorized form of name | def xtf_base = env['XTF_BASE_URL'] ?: "http://socialarchive.iath.virginia.edu/xtf/search?raw=1&sectionType="
def database_path = env['GRAPH_DB'] ?: "./neo4j-db"
def graphML = env['GRAPH_ML'] ?: "./graph-snac-example.xml" |
does the input even exist ? | def dir = new File(data_root)
if (!(dir.exists()) ){ println data_root + " not found"; System.exit(1) } |
create graph | def g = new Neo4jGraph(database_path) |
we'll need this index later; autoindex incantation | indexKeys = new HashSet()
indexKeys.add("identity")
index = g.createAutomaticIndex('name-idx', Vertex.class, indexKeys) |
create manual index for "related items" api | sourceEADurlIndex = g.createManualIndex('sourceEADurlIndex', Vertex.class) |
first loop; define vertex for each name / EAC file | dir.eachFile{file->
def eac = new XmlSlurper().parse(file).declareNamespace(xlink: 'http://www.w3.org/1999/xlink') |
xpath: | def fromName = eac.cpfDescription.identity[0].nameEntry[0].part
def entityType = eac.cpfDescription.identity[0].entityType
def viafFunk = eac.control.otherRecordId.findAll{ it.@localType == 'VIAFId' }[0].text()
def viaf = viafFunk.replaceFirst(/VIAFId:/, "http://viaf.org/viaf/")
def dbpedia = eac.control.otherRecordId.findAll{ it.@localType == 'dbpedia' }[0].text()
dbpedia = dbpedia.replaceFirst(/dbpedia:/, "")
|
| def creatorOf = ''
def referencedIn = '' |
I happen to know only the creatorOf and referencedIn arcroles matter here | eac.cpfDescription.relations.resourceRelation.findAll { it."@xlink:href" != '' }.each {
if (it."@xlink:arcrole" == "creatorOf") {
creatorOf = creatorOf + it."@xlink:href" + "\n"
}
if (it."@xlink:arcrole" == "referencedIn") {
referencedIn = referencedIn + it."@xlink:href" + "\n"
}
} |
stuff the stuff into the graph database | Vertex vertex = g.addVertex(null)
vertex["filename"] = file.getName()
vertex["identity"] = fromName as String
vertex["entityType"] = entityType as String |
there has got to be a better way? | if (creatorOf != '') {
vertex["creatorOf"] = creatorOf
creatorOf.tokenize("\n").each {
sourceEADurlIndex.put("creatorOf", it, vertex)
}
}
if (referencedIn != '') {
vertex["referencedIn"] = referencedIn
referencedIn.tokenize("\n").each {
sourceEADurlIndex.put("referencedIn", it, vertex)
}
}
if (viaf != '') { vertex["viaf"] = viaf }
if (dbpedia != '') { vertex["dbpedia"] = dbpedia as String }
print vertex["identity"]
println vertex
} |
second loop; create the edges; here is where the web is woven | dir.eachFile{file-> |
first, get the vertex for this file | def eac = new XmlSlurper().parse(file).declareNamespace(xlink: 'http://www.w3.org/1999/xlink')
def from_name = eac.cpfDescription.identity[0].nameEntry[0].part
def from_node = index.get("identity", from_name as String)>>1
def from_file = from_node.filename |
now, process all related names | eac.cpfDescription.relations.cpfRelation.each { |
parse the record id out of the descriptive note | String p = it.descriptiveNote.p
def recordId = p[10..p.size()-1] // so hackish |
| def crossQueryResult = new XmlSlurper().parse("${xtf_base}control&text=${recordId}")
def to_name = crossQueryResult.docHit[0].meta.identity[0]
def where = "recordId" |
no luck with the record id; fall back to searching on the name | if ( to_name == '') {
crossQueryResult = new XmlSlurper().parse("${xtf_base}identity&text=${it.relationEntry}")
to_name = crossQueryResult.docHit[0].meta.identity[0]
where = "identity"
} |
get the vertex to connect to | def to_node, to_file
to_node_iterator = index.get("identity", to_name as String)
if ( to_node_iterator ) {
to_node = to_node_iterator.next()
to_file = to_node.filename
} |
we'll need to know the edge type | def arcrole = it."@xlink:arcrole" |
if we can create an edge, then do so | if ( from_node && to_node && arcrole && (from_node != to_node) ) {
def e = g.addEdge(null, from_node, to_node, arcrole as String) |
and add some properties to the edge | e["to_name"] = to_name as String
e["to_file"] = to_file as String
e["from_name"] = from_name as String
e["from_file"] = from_file as String
e["pre_merge_record_id"] = recordId as String |
no match found | } else {
print "SKIPPED"
print "${from_node}|${from_file}|${to_node}|${to_file}|${arcrole}"
}
println "\"${from_name}\" ${arcrole} \"${to_name}\"; ${recordId} ${where}"
}
}
println "compute popularity" |
pre compute popularity score (for each vertex in the graph) | for (z in g.V ) {
z.score = z.out.count()
print "${z.score}."
} |
output a graphML file | GraphMLWriter.outputGraph(g, new FileOutputStream(graphML)) |
neo4j likes to be shutdown graceful like | g.shutdown() |
gist https://gist.github.com/1593245 has samples of the output of this script | |