diff --git a/LICENSE b/LICENSE index ad7d05f..d9a6b0f 100755 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2015 Stephen Lumenta +Copyright (c) 2015 Stephen Lumenta, and 2017 Elliott Stoneham Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/brd.go b/brd.go new file mode 100644 index 0000000..8f973d4 --- /dev/null +++ b/brd.go @@ -0,0 +1,166 @@ +package ner + +/* +#include +#include +#include "mitie.h" +*/ +import "C" +import ( + "errors" + "io/ioutil" + "path/filepath" + "unsafe" +) + +var ( + // ErrIncompatibleNER occurs when an incompatible NER object is used with the relation detector. + ErrIncompatibleNER = errors.New("an incompatible NER object was used with the relation detector") +) + +// Detector wraps a MITIE binary relation detector loaded from a model file. +type Detector struct { + brd *C.mitie_binary_relation_detector +} + +// NewDetector returns a Detector given the path to a binary relationship detection model, or ErrCantOpen when MITIE returns no detector for that path. +func NewDetector(path string) (*Detector, error) { + model := C.CString(path) + defer C.free(unsafe.Pointer(model)) + brd := C.mitie_load_binary_relation_detector(model) + if brd == nil { + return nil, ErrCantOpen + } + + return &Detector{ + brd: brd, + }, nil +} + +// Free releases the C memory held by the detector. +func (det *Detector) Free() { + C.mitie_free(unsafe.Pointer(det.brd)) +} + +// String returns the detector's name as reported by MITIE. +func (det *Detector) String() string { + return C.GoString(C.mitie_binary_relation_detector_name_string(det.brd)) +} + +// AllDetectorsFromDir loads a Detector from every ".svm" file in svmModelDir and returns the first load error it hits. 
+func AllDetectorsFromDir(svmModelDir string) (detectors []*Detector, err error) { + + files, err := ioutil.ReadDir(svmModelDir) + if err != nil { + return nil, err + } + for _, fi := range files { + if filepath.Ext(fi.Name()) == ".svm" { + svmPath := filepath.Join(svmModelDir, fi.Name()) + det, err := NewDetector(svmPath) + if err != nil { + return nil, err + } + detectors = append(detectors, det) + } + } + + return detectors, nil +} + +type Relation struct { + Relationship string // name of the detector that reported the relation + From, To Range // token ranges of the first and second entity argument + Score float64 // detector score; only positive scores are reported +} + +// Detect runs every detector over each pair of neighbouring entities of a previous extraction, in both argument orders. +func (extn *Extraction) Detect(detectors []*Detector) ([]Relation, error) { + + ret := []Relation{} + + // Scan along the entities and ask the relation detectors which adjacent pairs of + // entities are instances of the type of relation they are looking for. + for i := 0; i+1 < extn.numDets; i++ { + rels, err := detectRelation(detectors, extn.extractor.ner, extn.ctokens, extn.dets, C.ulong(i), C.ulong(i+1)) + if err != nil { + return nil, err + } + ret = append(ret, rels...) + + // Relations have an ordering to their arguments. So even if the above + // relation check failed we still might have a valid relation if we try + // swapping the two arguments. So that's what we do here. + rels, err = detectRelation(detectors, extn.extractor.ner, extn.ctokens, extn.dets, C.ulong(i+1), C.ulong(i)) + if err != nil { + return nil, err + } + ret = append(ret, rels...) 
+ } + + return ret, nil +} + +// detectRelation runs every detector on the entity pair (idx1, idx2); the logic follows the MITIE C example. +func detectRelation( + detectors []*Detector, + ner *C.mitie_named_entity_extractor, + tokens **C.char, + dets *C.mitie_named_entity_detections, + idx1 C.ulong, + idx2 C.ulong, +) (rels []Relation, err error) { + idx1pos := C.mitie_ner_get_detection_position(dets, idx1) + idx1len := C.mitie_ner_get_detection_length(dets, idx1) + idx2pos := C.mitie_ner_get_detection_position(dets, idx2) + idx2len := C.mitie_ner_get_detection_length(dets, idx2) + + if C.mitie_entities_overlap(idx1pos, idx1len, idx2pos, idx2len) != 0 { + return nil, nil + } + + // The relation detection process in MITIE has two steps. First you extract a set of + // "features" that describe a particular relation mention. Then you call + // mitie_classify_binary_relation() on those features and see if it is an instance of a + // particular kind of relation. The reason we have this two step process is because, + // in many applications, you will have a large set of relation detectors you need to + // evaluate for each possible relation instance and it is more efficient to perform the + // feature extraction once and then reuse the results for multiple calls to + // mitie_classify_binary_relation(). That is the case here: the features are + // extracted once and every detector in the slice is run against them. + relation := C.mitie_extract_binary_relation(ner, tokens, idx1pos, idx1len, idx2pos, idx2len) + if relation == nil { + return nil, ErrMemory + } + defer C.mitie_free(unsafe.Pointer(relation)) + + for _, detector := range detectors { + var score C.double + + // Calling this function runs the relation detector on the relation and stores the + // output into score. If score is > 0 then the detector is indicating that this + // relation mention is an example of the type of relation this detector is looking for. + // Moreover, the larger the score, the more confident the detector is that this + // is a correct relation detection. 
+ if C.mitie_classify_binary_relation(detector.brd, relation, &score) != 0 { + // When you train a relation detector it uses features derived from a MITIE NER + // object as part of its processing. This is also evident in the interface of + // mitie_extract_binary_relation() which requires a NER object to perform feature + // extraction. Because of this, every relation detector depends on a NER object + // and, moreover, it is important that you use the same NER object which was used + // during training when you run the relation detector. If you don't use the same + // NER object instance the mitie_classify_binary_relation() routine will return an + // error. + return nil, ErrIncompatibleNER + } + if float64(score) > 0 { + rels = append(rels, Relation{ + Relationship: detector.String(), + From: Range{Start: int(idx1pos), End: int(idx1len) + int(idx1pos)}, + To: Range{Start: int(idx2pos), End: int(idx2len) + int(idx2pos)}, + Score: float64(score), + }) + } + } + return rels, nil +} diff --git a/ner.go b/ner.go index b5c2574..7dcd7ac 100644 --- a/ner.go +++ b/ner.go @@ -55,6 +55,29 @@ func Tokenize(text string) []string { return tokens } +// TokenizeWithOffsets is identical to calling Tokenize(text) +// but it also outputs the positions of each token within the input text data. +func TokenizeWithOffsets(text string) ([]string, []uint32) { + cs := C.CString(text) + defer C.free(unsafe.Pointer(cs)) + var cOffsets *C.ulong + defer func() { C.mitie_free(unsafe.Pointer(cOffsets)) }() // closure: cOffsets is read at return, after MITIE has set it + ctokens := C.mitie_tokenize_with_offsets(cs, &cOffsets) + defer C.mitie_free(unsafe.Pointer(ctokens)) + i := 0 + // a hack since mitie arrays are NULL terminated. 
+ p := (*[1 << 30]*C.char)(unsafe.Pointer(ctokens)) + q := (*[1 << 30]C.ulong)(unsafe.Pointer(cOffsets)) + tokens := make([]string, 0, 20) + offsets := make([]uint32, 0, 20) + for p[i] != nil { + tokens = append(tokens, C.GoString(p[i])) + offsets = append(offsets, uint32(q[i])) + i++ + } + return tokens, offsets +} + // Range specifies the position of an Entity within a token slice. type Range struct { Start int @@ -72,7 +95,8 @@ type Entity struct { // Extractor detects entities based on a language model file. type Extractor struct { - ner *C.mitie_named_entity_extractor + ner *C.mitie_named_entity_extractor + tags []string // tag names of the model, e.g. PERSON or LOCATION; filled in once by NewExtractor } // NewExtractor returns an Extractor given the path to a language model. @@ -84,8 +108,15 @@ func NewExtractor(path string) (*Extractor, error) { return nil, ErrCantOpen } + num := int(C.mitie_get_num_possible_ner_tags(ner)) + tags := make([]string, num, num) + for i := 0; i < num; i++ { + tags[i] = C.GoString(C.mitie_get_named_entity_tagstr(ner, C.ulong(i))) + } + return &Extractor{ - ner: ner, + ner: ner, + tags: tags, }, nil } @@ -97,12 +128,7 @@ func (ext *Extractor) Free() { // Tags returns a slice of Tags that are part of this language model. // E.g. PERSON or LOCATION, etc… func (ext *Extractor) Tags() []string { - num := int(C.mitie_get_num_possible_ner_tags(ext.ner)) - tags := make([]string, num, num) - for i := 0; i < num; i++ { - tags[i] = ext.tagString(i) - } - return tags + return ext.tags } func (ext *Extractor) tagString(index int) string { @@ -110,34 +136,68 @@ func (ext *Extractor) tagString(index int) string { } // Extract runs the extractor and returns a slice of Entities found in the -// given tokens. +// given tokens. It is a convenience wrapper around NewExtraction that frees the extraction before returning. 
func (ext *Extractor) Extract(tokens []string) ([]Entity, error) { - ctokens := C.ner_arr_make(C.int(len(tokens)) + 1) // NULL termination - defer C.ner_arr_free(ctokens, C.int(len(tokens))+1) + extraction, err := ext.NewExtraction(tokens) + if err != nil { + return nil, err + } + defer extraction.Free() + return extraction.Entities, nil +} + +// Extraction holds the result of an extract run together with the C-side tokens and detections that Detect reads. +type Extraction struct { + Tokens []string + Entities []Entity + extractor *Extractor + ctokens **C.char + dets *C.struct_mitie_named_entity_detections + numDets int +} + +// NewExtraction completes an extraction task and returns the extraction results for future use in relationship extraction. Call Free on the result when done to release its C memory. +func (ext *Extractor) NewExtraction(tokens []string) (*Extraction, error) { + extn := &Extraction{ + extractor: ext, + Tokens: tokens, + } + extn.ctokens = C.ner_arr_make(C.int(len(tokens)) + 1) // one extra slot for the terminating NULL for i, t := range tokens { cs := C.CString(t) // released by ner_arr_free - C.ner_arr_set(ctokens, cs, C.int(i)) + C.ner_arr_set(extn.ctokens, cs, C.int(i)) } - dets := C.mitie_extract_entities(ext.ner, ctokens) - defer C.mitie_free(unsafe.Pointer(dets)) - if dets == nil { + extn.dets = C.mitie_extract_entities(ext.ner, extn.ctokens) + if extn.dets == nil { + C.ner_arr_free(extn.ctokens, C.int(len(extn.Tokens))+1) return nil, ErrMemory } - n := int(C.mitie_ner_get_num_detections(dets)) - entities := make([]Entity, n, n) + extn.numDets = int(C.mitie_ner_get_num_detections(extn.dets)) + + extn.Entities = make([]Entity, extn.numDets, extn.numDets) - for i := 0; i < n; i++ { - pos := int(C.mitie_ner_get_detection_position(dets, C.ulong(i))) - len := int(C.mitie_ner_get_detection_length(dets, C.ulong(i))) + tagNames := ext.Tags() + for i := 0; i < extn.numDets; i++ { + pos := int(C.mitie_ner_get_detection_position(extn.dets, C.ulong(i))) + len := int(C.mitie_ner_get_detection_length(extn.dets, C.ulong(i))) + tagID := int(C.mitie_ner_get_detection_tag(extn.dets, C.ulong(i))) 
- entities[i] = Entity{ - Tag: int(C.mitie_ner_get_detection_tag(dets, C.ulong(i))), - Score: float64(C.mitie_ner_get_detection_score(dets, C.ulong(i))), - Name: strings.Join(tokens[pos:pos+len], " "), - Range: Range{pos, pos + len}, + extn.Entities[i] = Entity{ + Tag: tagID, + TagString: tagNames[tagID], + Score: float64(C.mitie_ner_get_detection_score(extn.dets, C.ulong(i))), + Name: strings.Join(extn.Tokens[pos:pos+len], " "), + Range: Range{pos, pos + len}, } } - return entities, nil + + return extn, nil +} + +// Free releases the C memory (token array and detections) used by the extraction. The token array is sized from len(Tokens), so Tokens must not be resized before calling Free. +func (extn *Extraction) Free() { + C.ner_arr_free(extn.ctokens, C.int(len(extn.Tokens))+1) + C.mitie_free(unsafe.Pointer(extn.dets)) }