Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
The MIT License (MIT)

Copyright (c) 2015 Stephen Lumenta
Copyright (c) 2015 Stephen Lumenta, and 2017 Elliott Stoneham

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
166 changes: 166 additions & 0 deletions brd.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
package ner

/*
#include <stdlib.h>
#include <stdio.h>
#include "mitie.h"
*/
import "C"
import (
"errors"
"io/ioutil"
"path/filepath"
"unsafe"
)

var (
// ErrIncompatibleNER occurs when an incompatible NER object is used with the relation detector.
// It is returned by relation detection when mitie_classify_binary_relation() reports an error,
// which happens when the NER object used for feature extraction is not the one the detector
// was trained with.
ErrIncompatibleNER = errors.New("an incompatible NER object was used with the relation detector")
)

// Detector of binary relationships.
//
// A Detector wraps a MITIE binary relation detector loaded by NewDetector.
// The underlying C object is released by calling Free.
type Detector struct {
// brd is the C handle returned by mitie_load_binary_relation_detector.
brd *C.mitie_binary_relation_detector
}

// NewDetector loads the binary relationship detection model stored at path
// and wraps it in a Detector.
//
// ErrCantOpen is returned when MITIE does not hand back a detector for the
// given path. On success the caller should eventually call Free on the result
// to release the C memory.
func NewDetector(path string) (*Detector, error) {
	// The C copy of the path is only needed for the duration of the load call.
	cpath := C.CString(path)
	defer C.free(unsafe.Pointer(cpath))

	det := &Detector{brd: C.mitie_load_binary_relation_detector(cpath)}
	if det.brd == nil {
		return nil, ErrCantOpen
	}
	return det, nil
}

// Free releases the C memory held by the detector by handing the underlying
// handle to mitie_free.
func (det *Detector) Free() {
	handle := unsafe.Pointer(det.brd)
	C.mitie_free(handle)
}

// String returns the detector's name as reported by MITIE, copied into a Go
// string.
func (det *Detector) String() string {
	name := C.mitie_binary_relation_detector_name_string(det.brd)
	return C.GoString(name)
}

// AllDetectorsFromDir loads every binary relation detector model found
// directly inside svmModelDir. A file is treated as a model when its name has
// the ".svm" extension; all other entries are ignored.
//
// The detectors are returned in the order ioutil.ReadDir lists the files. The
// caller owns the returned detectors and should call Free on each of them.
//
// An error is returned when the directory cannot be read or when any model
// fails to load. In the latter case the detectors that were already loaded are
// freed before returning, so no C memory is leaked, and the returned slice is
// nil.
func AllDetectorsFromDir(svmModelDir string) (detectors []*Detector, err error) {
	files, err := ioutil.ReadDir(svmModelDir)
	if err != nil {
		return nil, err
	}

	for _, fi := range files {
		if filepath.Ext(fi.Name()) != ".svm" {
			continue
		}

		det, err := NewDetector(filepath.Join(svmModelDir, fi.Name()))
		if err != nil {
			// The caller never sees the partial result, so release the C
			// objects behind the detectors loaded so far.
			for _, loaded := range detectors {
				loaded.Free()
			}
			return nil, err
		}
		detectors = append(detectors, det)
	}

	return detectors, nil
}

// Relation describes one binary relationship that a Detector found between
// two entities of an Extraction.
type Relation struct {
// Relationship is the name of the detector that reported the relation
// (the value of Detector.String).
Relationship string
// From and To are the token ranges of the first and second argument of the
// relation. Start is the detection position and End is position + length.
From, To Range
// Score is the classifier output for this relation; only relations with a
// score greater than zero are reported.
Score float64
}

// Detect runs the given relation detectors over the entities of a previous
// extraction and returns every relation they report.
//
// Only neighbouring detections are paired: detection i is checked against
// detection i+1. Relations have an ordering to their arguments, so each pair
// is tried in both directions (i -> i+1 first, then i+1 -> i).
//
// The result is an empty, non-nil slice when nothing is found. The first
// error from relation detection aborts the scan and is returned with a nil
// slice.
func (extn *Extraction) Detect(detectors []*Detector) ([]Relation, error) {
	found := []Relation{}

	for second := 1; second < extn.numDets; second++ {
		first := second - 1

		// Forward order first, then the swapped arguments.
		for _, pair := range [2][2]int{{first, second}, {second, first}} {
			rels, err := detectRelation(
				detectors,
				extn.extractor.ner,
				extn.ctokens,
				extn.dets,
				C.ulong(pair[0]),
				C.ulong(pair[1]),
			)
			if err != nil {
				return nil, err
			}
			found = append(found, rels...)
		}
	}

	return found, nil
}

// detectRelation checks whether the detections idx1 and idx2 (in that argument
// order) form a relation according to any of the given detectors. The logic is
// adapted from the MITIE C example.
//
// It returns nil, nil when the two entities overlap, ErrMemory when MITIE
// cannot produce the relation features, and ErrIncompatibleNER when a detector
// rejects the features. Otherwise it returns one Relation per detector whose
// score is greater than zero.
func detectRelation(
	detectors []*Detector,
	ner *C.mitie_named_entity_extractor,
	tokens **C.char,
	dets *C.mitie_named_entity_detections,
	idx1 C.ulong,
	idx2 C.ulong,
) (rels []Relation, err error) {
	fromPos := C.mitie_ner_get_detection_position(dets, idx1)
	fromLen := C.mitie_ner_get_detection_length(dets, idx1)
	toPos := C.mitie_ner_get_detection_position(dets, idx2)
	toLen := C.mitie_ner_get_detection_length(dets, idx2)

	// Overlapping entities are never considered as a relation candidate.
	if C.mitie_entities_overlap(fromPos, fromLen, toPos, toLen) != 0 {
		return nil, nil
	}

	// Relation detection in MITIE is a two step process: first a set of
	// "features" describing this relation mention is extracted, then each
	// detector classifies those features. The features are extracted once
	// here and reused for every detector in the loop below.
	features := C.mitie_extract_binary_relation(ner, tokens, fromPos, fromLen, toPos, toLen)
	if features == nil {
		return nil, ErrMemory
	}
	defer C.mitie_free(unsafe.Pointer(features))

	from := Range{Start: int(fromPos), End: int(fromLen) + int(fromPos)}
	to := Range{Start: int(toPos), End: int(toLen) + int(toPos)}

	for _, detector := range detectors {
		var score C.double

		// A non-zero return signals an error. A relation detector depends on
		// the NER object it was trained with, and the same NER object must be
		// used for feature extraction; otherwise classification fails.
		if C.mitie_classify_binary_relation(detector.brd, features, &score) != 0 {
			return nil, ErrIncompatibleNER
		}

		// A score above zero means the detector considers this mention an
		// instance of its relation; larger scores mean higher confidence.
		confidence := float64(score)
		if confidence > 0 {
			rels = append(rels, Relation{
				Relationship: detector.String(),
				From:         from,
				To:           to,
				Score:        confidence,
			})
		}
	}
	return rels, nil
}
112 changes: 86 additions & 26 deletions ner.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,29 @@ func Tokenize(text string) []string {
return tokens
}

// TokenizeWithOffsets is identical to calling Tokenize(text)
// but it also outputs the positions of each token within the input text data.
//
// The two returned slices have the same length; offsets[i] is the offset that
// MITIE reports for tokens[i], narrowed from C unsigned long to uint32.
// If MITIE returns NULL instead of a token array, both results are nil.
func TokenizeWithOffsets(text string) ([]string, []uint32) {
	cs := C.CString(text)
	defer C.free(unsafe.Pointer(cs))

	var cOffsets *C.ulong
	ctokens := C.mitie_tokenize_with_offsets(cs, &cOffsets)
	if ctokens == nil {
		return nil, nil
	}
	// Both arrays are allocated by MITIE and have to be released with
	// mitie_free. The defers are registered only after the call: deferred
	// call arguments are evaluated immediately, so deferring earlier would
	// capture the still-nil cOffsets and leak the real array.
	defer C.mitie_free(unsafe.Pointer(ctokens))
	if cOffsets == nil {
		// Defensive: without an offsets array there is nothing to pair the
		// tokens with.
		return nil, nil
	}
	defer C.mitie_free(unsafe.Pointer(cOffsets))

	// a hack since mitie arrays are NULL terminated.
	p := (*[1 << 30]*C.char)(unsafe.Pointer(ctokens))
	q := (*[1 << 30]C.ulong)(unsafe.Pointer(cOffsets))

	tokens := make([]string, 0, 20)
	offsets := make([]uint32, 0, 20)
	for i := 0; p[i] != nil; i++ {
		tokens = append(tokens, C.GoString(p[i]))
		offsets = append(offsets, uint32(q[i]))
	}
	return tokens, offsets
}

// Range specifies the position of an Entity within a token slice.
type Range struct {
Start int
Expand All @@ -72,7 +95,8 @@ type Entity struct {

// Extractor detects entities based on a language model file.
type Extractor struct {
ner *C.mitie_named_entity_extractor
ner *C.mitie_named_entity_extractor
tags []string // E.g. PERSON or LOCATION, etc…
}

// NewExtractor returns an Extractor given the path to a language model.
Expand All @@ -84,8 +108,15 @@ func NewExtractor(path string) (*Extractor, error) {
return nil, ErrCantOpen
}

num := int(C.mitie_get_num_possible_ner_tags(ner))
tags := make([]string, num, num)
for i := 0; i < num; i++ {
tags[i] = C.GoString(C.mitie_get_named_entity_tagstr(ner, C.ulong(i)))
}

return &Extractor{
ner: ner,
ner: ner,
tags: tags,
}, nil
}

Expand All @@ -97,47 +128,76 @@ func (ext *Extractor) Free() {
// Tags returns a slice of Tags that are part of this language model.
// E.g. PERSON or LOCATION, etc…
func (ext *Extractor) Tags() []string {
num := int(C.mitie_get_num_possible_ner_tags(ext.ner))
tags := make([]string, num, num)
for i := 0; i < num; i++ {
tags[i] = ext.tagString(i)
}
return tags
return ext.tags
}

func (ext *Extractor) tagString(index int) string {
return C.GoString(C.mitie_get_named_entity_tagstr(ext.ner, C.ulong(index)))
}

// Extract runs the extractor and returns a slice of Entities found in the
// given tokens.
// given tokens. It is a convenience function.
func (ext *Extractor) Extract(tokens []string) ([]Entity, error) {
ctokens := C.ner_arr_make(C.int(len(tokens)) + 1) // NULL termination
defer C.ner_arr_free(ctokens, C.int(len(tokens))+1)
extraction, err := ext.NewExtraction(tokens)
if err != nil {
return nil, err
}
defer extraction.Free()
return extraction.Entities, nil
}

// Extraction describes the result of an extract run.
//
// It keeps the C-side token array and detections alive so that they can be
// reused for relationship detection; call Free to release them.
type Extraction struct {
// Tokens is the token slice that was passed to NewExtraction.
Tokens []string
// Entities holds one Entity per detection reported by MITIE.
Entities []Entity
// extractor is the Extractor that produced this extraction.
extractor *Extractor
// ctokens is the NULL-terminated C copy of Tokens (len(Tokens)+1 slots).
ctokens **C.char
// dets is the detection result returned by mitie_extract_entities.
dets *C.struct_mitie_named_entity_detections
// numDets is the number of detections in dets.
numDets int
}

// NewExtraction completes an extraction task and returns the extraction results for future use in relationship extraction.
func (ext *Extractor) NewExtraction(tokens []string) (*Extraction, error) {
extn := &Extraction{
extractor: ext,
Tokens: tokens,
}
extn.ctokens = C.ner_arr_make(C.int(len(tokens)) + 1) // NULL termination
for i, t := range tokens {
cs := C.CString(t) // released by ner_arr_free
C.ner_arr_set(ctokens, cs, C.int(i))
C.ner_arr_set(extn.ctokens, cs, C.int(i))
}

dets := C.mitie_extract_entities(ext.ner, ctokens)
defer C.mitie_free(unsafe.Pointer(dets))
if dets == nil {
extn.dets = C.mitie_extract_entities(ext.ner, extn.ctokens)
if extn.dets == nil {
C.ner_arr_free(extn.ctokens, C.int(len(extn.Tokens))+1)
return nil, ErrMemory
}

n := int(C.mitie_ner_get_num_detections(dets))
entities := make([]Entity, n, n)
extn.numDets = int(C.mitie_ner_get_num_detections(extn.dets))

extn.Entities = make([]Entity, extn.numDets, extn.numDets)

for i := 0; i < n; i++ {
pos := int(C.mitie_ner_get_detection_position(dets, C.ulong(i)))
len := int(C.mitie_ner_get_detection_length(dets, C.ulong(i)))
tagNames := ext.Tags()
for i := 0; i < extn.numDets; i++ {
pos := int(C.mitie_ner_get_detection_position(extn.dets, C.ulong(i)))
len := int(C.mitie_ner_get_detection_length(extn.dets, C.ulong(i)))
tagID := int(C.mitie_ner_get_detection_tag(extn.dets, C.ulong(i)))

entities[i] = Entity{
Tag: int(C.mitie_ner_get_detection_tag(dets, C.ulong(i))),
Score: float64(C.mitie_ner_get_detection_score(dets, C.ulong(i))),
Name: strings.Join(tokens[pos:pos+len], " "),
Range: Range{pos, pos + len},
extn.Entities[i] = Entity{
Tag: tagID,
TagString: tagNames[tagID],
Score: float64(C.mitie_ner_get_detection_score(extn.dets, C.ulong(i))),
Name: strings.Join(extn.Tokens[pos:pos+len], " "),
Range: Range{pos, pos + len},
}
}
return entities, nil

return extn, nil
}

// Free releases the C memory used by the extraction: the C copy of the tokens
// and the MITIE detections. Detect reads both, so call Free only once the
// extraction is no longer needed for relation detection.
func (extn *Extraction) Free() {
	// The token array was allocated with one extra slot for the NULL terminator.
	slots := C.int(len(extn.Tokens)) + 1
	C.ner_arr_free(extn.ctokens, slots)

	C.mitie_free(unsafe.Pointer(extn.dets))
}