@Override
public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings)
throws IOException {
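+        // Swapped to the cached, per-synonyms_path factory: getInstance (added in
+        // DynamicSynonymGraphTokenFilterFactory below) reuses one instance per location.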
- return new DynamicSynonymTokenFilterFactory(indexSettings, environment, name, settings, pluginComponent.getAnalysisRegistry());
+ return DynamicSynonymGraphTokenFilterFactory.getInstance(indexSettings, environment, name, settings, pluginComponent.getAnalysisRegistry());
}
@Override
diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java
index 6edca5b..ce77f13 100644
--- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java
+++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymFilter.java
@@ -17,6 +17,7 @@
* limitations under the License.
*/
+import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -25,7 +26,6 @@
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.*;
import org.apache.lucene.util.fst.FST;
-import org.elasticsearch.common.logging.ESLoggerFactory;
import java.io.IOException;
import java.util.Arrays;
@@ -102,13 +102,14 @@
public final class DynamicSynonymFilter extends TokenFilter {
- public static Logger logger = ESLoggerFactory.getLogger("dynamic-synonym");
+ private final Logger logger = LogManager.getLogger("dynamic-synonym");
public static final String TYPE_SYNONYM = "SYNONYM";
private SynonymMap synonyms;
private final boolean ignoreCase;
+ private final boolean ignoreOffset;
private int rollBufferSize;
private int captureCount;
@@ -263,9 +264,12 @@ public void add(char[] output, int offset, int len, int endOffset,
* when you create the {@link SynonymMap}
*/
public DynamicSynonymFilter(TokenStream input, SynonymMap synonyms,
- boolean ignoreCase) {
+ boolean ignoreCase,
+ boolean ignoreOffset) {
super(input);
this.ignoreCase = ignoreCase;
+ this.ignoreOffset = ignoreOffset;
update(synonyms);
}
@@ -589,7 +593,11 @@ public boolean incrementToken() throws IOException {
if (endOffset == -1) {
endOffset = input.endOffset;
}
- offsetAtt.setOffset(input.startOffset, endOffset);
+ if (!ignoreOffset) {
+ offsetAtt.setOffset(input.startOffset, endOffset);
+ } else {
+ logger.debug("dynamic synonym offset ignored");
+ }
posIncrAtt.setPositionIncrement(posIncr);
posLenAtt.setPositionLength(outputs.getLastPosLength());
if (outputs.count == 0) {
@@ -623,7 +631,11 @@ public boolean incrementToken() throws IOException {
}
clearAttributes();
// Keep offset from last input token:
- offsetAtt.setOffset(lastStartOffset, lastEndOffset);
+ if (!this.ignoreOffset) {
+ offsetAtt.setOffset(lastStartOffset, lastEndOffset);
+ } else {
+ logger.debug("set offset ignored");
+ }
termAtt.copyBuffer(output.chars, output.offset,
output.length);
typeAtt.setType(TYPE_SYNONYM);
diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphFilter.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphFilter.java
new file mode 100644
index 0000000..6a0fba2
--- /dev/null
+++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphFilter.java
@@ -0,0 +1,622 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.bellszhu.elasticsearch.plugin.synonym.analysis;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.FlattenGraphFilter;
+import org.apache.lucene.analysis.synonym.SynonymFilter;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.RollingBuffer;
+import org.apache.lucene.util.fst.FST;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+
+// TODO: maybe we should resolve token -> wordID then run
+// FST on wordIDs, for better perf?
+
+// TODO: a more efficient approach would be Aho/Corasick's
+// algorithm
+// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
+// It improves over the current approach here
+// because it does not fully re-start matching at every
+// token. For example if one pattern is "a b c x"
+// and another is "b c d" and the input is "a b c d", on
+// trying to parse "a b c x" but failing when you got to x,
+// rather than starting over again your really should
+// immediately recognize that "b c d" matches at the next
+// input. I suspect this won't matter that much in
+// practice, but it's possible on some set of synonyms it
+// will. We'd have to modify Aho/Corasick to enforce our
+// conflict resolving (eg greedy matching) because that algo
+// finds all matches. This really amounts to adding a .*
+// closure to the FST and then determinizing it.
+//
+// Another possible solution is described at http://www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
+
+/** Applies single- or multi-token synonyms from a {@link SynonymMap}
+ * to an incoming {@link TokenStream}, producing a fully correct graph
+ * output. This is a replacement for {@link SynonymFilter}, which produces
+ * incorrect graphs for multi-token synonyms.
+ *
+ * However, if you use this during indexing, you must follow it with
+ * {@link FlattenGraphFilter} to squash tokens on top of one another
+ * like {@link SynonymFilter}, because the indexer can't directly
+ * consume a graph. To get fully correct positional queries when your
+ * synonym replacements are multiple tokens, you should instead apply
+ * synonyms using this {@code TokenFilter} at query time and translate
+ * the resulting graph to a {@code TermAutomatonQuery} e.g. using
+ * {@code TokenStreamToTermAutomatonQuery}.
+ *
+ * <p>NOTE: this cannot consume an incoming graph; results will
+ * be undefined.
+ *
+ * @lucene.experimental */
+
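+// A wiring sketch (the registered filter type name is assumed here, not defined
+// in this patch); the settings keys match the factory below:
+//
+//   "filter": {
+//     "graph_synonyms": {
+//       "type": "dynamic_synonym_graph",
+//       "synonyms_path": "http://example.com/synonym.txt",
+//       "interval": 60,
+//       "expand": true
+//     }
+//   }
+//
+// At index time, follow this filter with FlattenGraphFilter as noted above.
+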
+public final class DynamicSynonymGraphFilter extends TokenFilter {
+
+ public static final String TYPE_SYNONYM = "SYNONYM";
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ private SynonymMap synonyms;
+ private final boolean ignoreCase;
+
+ private FST<BytesRef> fst;
+
+ private FST.BytesReader fstReader;
+ private FST.Arc<BytesRef> scratchArc;
+ private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
+ private final BytesRef scratchBytes = new BytesRef();
+ private final CharsRefBuilder scratchChars = new CharsRefBuilder();
+ private final LinkedList<BufferedOutputToken> outputBuffer = new LinkedList<>();
+
+ private int nextNodeOut;
+ private int lastNodeOut;
+ private int maxLookaheadUsed;
+
+ // For testing:
+ private int captureCount;
+
+ private boolean liveToken;
+
+ // Start/end offset of the current match:
+ private int matchStartOffset;
+ private int matchEndOffset;
+
+ // True once the input TokenStream is exhausted:
+ private boolean finished;
+
+ private int lookaheadNextRead;
+ private int lookaheadNextWrite;
+
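+ // lookahead buffers input tokens (terms, offsets, captured attribute state)
+ // that parse() consumed while probing the FST for a multi-token match, so a
+ // failed match can replay them instead of re-reading the input.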
+ private final RollingBuffer<BufferedInputToken> lookahead = new RollingBuffer<BufferedInputToken>() {
+ @Override
+ protected BufferedInputToken newInstance() {
+ return new BufferedInputToken();
+ }
+ };
+
+ static class BufferedInputToken implements RollingBuffer.Resettable {
+ final CharsRefBuilder term = new CharsRefBuilder();
+ State state;
+ int startOffset = -1;
+ int endOffset = -1;
+
+ @Override
+ public void reset() {
+ state = null;
+ term.clear();
+
+ // Intentionally invalid to ferret out bugs:
+ startOffset = -1;
+ endOffset = -1;
+ }
+ }
+
+ static class BufferedOutputToken {
+ final String term;
+
+ // Non-null if this was an incoming token:
+ final State state;
+
+ final int startNode;
+ final int endNode;
+
+ public BufferedOutputToken(State state, String term, int startNode, int endNode) {
+ this.state = state;
+ this.term = term;
+ this.startNode = startNode;
+ this.endNode = endNode;
+ }
+ }
+
+ /**
+ * Apply previously built synonyms to incoming tokens.
+ * @param input input tokenstream
+ * @param synonyms synonym map
+ * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
+ * Note, if you set this to true, it's your responsibility to lowercase
+ * the input entries when you create the {@link SynonymMap}
+ */
+ public DynamicSynonymGraphFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
+ super(input);
+ this.synonyms = synonyms;
+ this.fst = synonyms.fst;
+ if (fst == null) {
+ throw new IllegalArgumentException("fst must be non-null");
+ }
+ this.fstReader = fst.getBytesReader();
+ scratchArc = new FST.Arc<>();
+ this.ignoreCase = ignoreCase;
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ //System.out.println("\nS: incrToken lastNodeOut=" + lastNodeOut + " nextNodeOut=" + nextNodeOut);
+
+ assert lastNodeOut <= nextNodeOut;
+
+ if (outputBuffer.isEmpty() == false) {
+ // We still have pending outputs from a prior synonym match:
+ releaseBufferedToken();
+ //System.out.println(" syn: ret buffered=" + this);
+ assert liveToken == false;
+ return true;
+ }
+
+ // Try to parse a new synonym match at the current token:
+
+ if (parse()) {
+ // A new match was found:
+ releaseBufferedToken();
+ //System.out.println(" syn: after parse, ret buffered=" + this);
+ assert liveToken == false;
+ return true;
+ }
+
+ if (lookaheadNextRead == lookaheadNextWrite) {
+
+ // Fast path: parse pulled one token, but it didn't match
+ // the start for any synonym, so we now return it "live" w/o having
+ // cloned all of its atts:
+ if (finished) {
+ //System.out.println(" syn: ret END");
+ return false;
+ }
+
+ assert liveToken;
+ liveToken = false;
+
+ // NOTE: no need to change posInc since it's relative, i.e. whatever
+ // node our output is upto will just increase by the incoming posInc.
+ // We also don't need to change posLen, but only because we cannot
+ // consume a graph, so the incoming token can never span a future
+ // synonym match.
+
+ } else {
+ // We still have buffered lookahead tokens from a previous
+ // parse attempt that required lookahead; just replay them now:
+ //System.out.println(" restore buffer");
+ assert lookaheadNextRead < lookaheadNextWrite: "read=" + lookaheadNextRead + " write=" + lookaheadNextWrite;
+ BufferedInputToken token = lookahead.get(lookaheadNextRead);
+ lookaheadNextRead++;
+
+ restoreState(token.state);
+
+ lookahead.freeBefore(lookaheadNextRead);
+
+ //System.out.println(" after restore offset=" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset());
+ assert liveToken == false;
+ }
+
+ lastNodeOut += posIncrAtt.getPositionIncrement();
+ nextNodeOut = lastNodeOut + posLenAtt.getPositionLength();
+
+ //System.out.println(" syn: ret lookahead=" + this);
+
+ return true;
+ }
+
+ private void releaseBufferedToken() throws IOException {
+ //System.out.println(" releaseBufferedToken");
+
+ BufferedOutputToken token = outputBuffer.pollFirst();
+
+ if (token.state != null) {
+ // This is an original input token (keepOrig=true case):
+ //System.out.println(" hasState");
+ restoreState(token.state);
+ //System.out.println(" startOffset=" + offsetAtt.startOffset() + " endOffset=" + offsetAtt.endOffset());
+ } else {
+ clearAttributes();
+ //System.out.println(" no state");
+ termAtt.append(token.term);
+
+ // We better have a match already:
+ assert matchStartOffset != -1;
+
+ offsetAtt.setOffset(matchStartOffset, matchEndOffset);
+ //System.out.println(" startOffset=" + matchStartOffset + " endOffset=" + matchEndOffset);
+ typeAtt.setType(TYPE_SYNONYM);
+ }
+
+ //System.out.println(" lastNodeOut=" + lastNodeOut);
+ //System.out.println(" term=" + termAtt);
+
+ posIncrAtt.setPositionIncrement(token.startNode - lastNodeOut);
+ lastNodeOut = token.startNode;
+ posLenAtt.setPositionLength(token.endNode - token.startNode);
+ }
+
+ /** Scans the next input token(s) to see if a synonym matches. Returns true
+ * if a match was found. */
+ private boolean parse() throws IOException {
+ // System.out.println(Thread.currentThread().getName() + ": S: parse: " + System.identityHashCode(this));
+
+ // Holds the longest match we've seen so far:
+ BytesRef matchOutput = null;
+ int matchInputLength = 0;
+
+ BytesRef pendingOutput = fst.outputs.getNoOutput();
+ fst.getFirstArc(scratchArc);
+
+ assert scratchArc.output == fst.outputs.getNoOutput();
+
+ // How many tokens in the current match
+ int matchLength = 0;
+ boolean doFinalCapture = false;
+
+ int lookaheadUpto = lookaheadNextRead;
+ matchStartOffset = -1;
+
+ byToken:
+ while (true) {
+ //System.out.println(" cycle lookaheadUpto=" + lookaheadUpto + " maxPos=" + lookahead.getMaxPos());
+
+ // Pull next token's chars:
+ final char[] buffer;
+ final int bufferLen;
+ final int inputEndOffset;
+
+ if (lookaheadUpto <= lookahead.getMaxPos()) {
+ // Still in our lookahead buffer
+ BufferedInputToken token = lookahead.get(lookaheadUpto);
+ lookaheadUpto++;
+ buffer = token.term.chars();
+ bufferLen = token.term.length();
+ inputEndOffset = token.endOffset;
+ //System.out.println(" use buffer now max=" + lookahead.getMaxPos());
+ if (matchStartOffset == -1) {
+ matchStartOffset = token.startOffset;
+ }
+ } else {
+
+ // We used up our lookahead buffer of input tokens
+ // -- pull next real input token:
+
+ assert finished || liveToken == false;
+
+ if (finished) {
+ //System.out.println(" break: finished");
+ break;
+ } else if (input.incrementToken()) {
+ //System.out.println(" input.incrToken");
+ liveToken = true;
+ buffer = termAtt.buffer();
+ bufferLen = termAtt.length();
+ if (matchStartOffset == -1) {
+ matchStartOffset = offsetAtt.startOffset();
+ }
+ inputEndOffset = offsetAtt.endOffset();
+
+ lookaheadUpto++;
+ } else {
+ // No more input tokens
+ finished = true;
+ //System.out.println(" break: now set finished");
+ break;
+ }
+ }
+
+ matchLength++;
+ //System.out.println(" cycle term=" + new String(buffer, 0, bufferLen));
+
+ // Run each char in this token through the FST:
+ int bufUpto = 0;
+ while (bufUpto < bufferLen) {
+ final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
+ if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) {
+ break byToken;
+ }
+
+ // Accum the output
+ pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
+ bufUpto += Character.charCount(codePoint);
+ }
+
+ assert bufUpto == bufferLen;
+
+ // OK, entire token matched; now see if this is a final
+ // state in the FST (a match):
+ if (scratchArc.isFinal()) {
+ matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
+ matchInputLength = matchLength;
+ matchEndOffset = inputEndOffset;
+ //System.out.println(" ** match");
+ }
+
+ // See if the FST can continue matching (ie, needs to
+ // see the next input token):
+ if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
+ // No further rules can match here; we're done
+ // searching for matching rules starting at the
+ // current input position.
+ break;
+ } else {
+ // More matching is possible -- accum the output (if
+ // any) of the WORD_SEP arc:
+ pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
+ doFinalCapture = true;
+ if (liveToken) {
+ capture();
+ }
+ }
+ }
+
+ if (doFinalCapture && liveToken && finished == false) {
+ // Must capture the final token if we captured any prior tokens:
+ capture();
+ }
+
+ if (matchOutput != null) {
+
+ if (liveToken) {
+ // Single input token synonym; we must buffer it now:
+ capture();
+ }
+
+ // There is a match!
+ bufferOutputTokens(matchOutput, matchInputLength);
+ lookaheadNextRead += matchInputLength;
+ //System.out.println(" precmatch; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos());
+ lookahead.freeBefore(lookaheadNextRead);
+ //System.out.println(" match; set lookaheadNextRead=" + lookaheadNextRead + " now max=" + lookahead.getMaxPos());
+ return true;
+ } else {
+ //System.out.println(" no match; lookaheadNextRead=" + lookaheadNextRead);
+ return false;
+ }
+
+ //System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
+ }
+
+ /** Expands the output graph into the necessary tokens, adding
+ * synonyms as side paths parallel to the input tokens, and
+ * buffers them in the output token buffer. */
+ private void bufferOutputTokens(BytesRef bytes, int matchInputLength) {
+ bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
+
+ final int code = bytesReader.readVInt();
+ final boolean keepOrig = (code & 0x1) == 0;
+ //System.out.println(" buffer: keepOrig=" + keepOrig + " matchInputLength=" + matchInputLength);
+
+ // How many nodes along all paths; we need this to assign the
+ // node ID for the final end node where all paths merge back:
+ int totalPathNodes;
+ if (keepOrig) {
+ assert matchInputLength > 0;
+ totalPathNodes = matchInputLength - 1;
+ } else {
+ totalPathNodes = 0;
+ }
+
+ // How many synonyms we will insert over this match:
+ final int count = code >>> 1;
+
+ // TODO: we could encode this instead into the FST:
+
+ // 1st pass: count how many new nodes we need
+ List<List<String>> paths = new ArrayList<>();
+ for(int outputIDX=0;outputIDX<count;outputIDX++) {
+ int wordID = bytesReader.readVInt();
+ synonyms.words.get(wordID, scratchBytes);
+ scratchChars.copyUTF8Bytes(scratchBytes);
+ int lastStart = 0;
+
+ List<String> path = new ArrayList<>();
+ paths.add(path);
+ int chEnd = scratchChars.length();
+ for(int chUpto=0; chUpto<=chEnd; chUpto++) {
+ if (chUpto == chEnd || scratchChars.charAt(chUpto) == SynonymMap.WORD_SEPARATOR) {
+ path.add(new String(scratchChars.chars(), lastStart, chUpto - lastStart));
+ lastStart = 1 + chUpto;
+ }
+ }
+
+ assert path.size() > 0;
+ totalPathNodes += path.size() - 1;
+ }
+ //System.out.println(" totalPathNodes=" + totalPathNodes);
+
+ // 2nd pass: buffer tokens for the graph fragment
+
+ // NOTE: totalPathNodes will be 0 in the case where the matched
+ // input is a single token and all outputs are also a single token
+
+ // We "spawn" a side-path for each of the outputs for this matched
+ // synonym, all ending back at this end node:
+
+ int startNode = nextNodeOut;
+
+ int endNode = startNode + totalPathNodes + 1;
+ //System.out.println(" " + paths.size() + " new side-paths");
+
+ // First, fanout all tokens departing start node for these new side paths:
+ int newNodeCount = 0;
+ for(List<String> path : paths) {
+ int pathEndNode;
+ //System.out.println(" path size=" + path.size());
+ if (path.size() == 1) {
+ // Single token output, so there are no intermediate nodes:
+ pathEndNode = endNode;
+ } else {
+ pathEndNode = nextNodeOut + newNodeCount + 1;
+ newNodeCount += path.size() - 1;
+ }
+ outputBuffer.add(new BufferedOutputToken(null, path.get(0), startNode, pathEndNode));
+ }
+
+ // We must do the original tokens last, else the offsets "go backwards":
+ if (keepOrig) {
+ BufferedInputToken token = lookahead.get(lookaheadNextRead);
+ int inputEndNode;
+ if (matchInputLength == 1) {
+ // Single token matched input, so there are no intermediate nodes:
+ inputEndNode = endNode;
+ } else {
+ inputEndNode = nextNodeOut + newNodeCount + 1;
+ }
+
+ //System.out.println(" keepOrig first token: " + token.term);
+
+ outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), startNode, inputEndNode));
+ }
+
+ nextNodeOut = endNode;
+
+ // Do full side-path for each syn output:
+ for(int pathID=0;pathID<paths.size();pathID++) {
+ List<String> path = paths.get(pathID);
+ if (path.size() > 1) {
+ int lastNode = outputBuffer.get(pathID).endNode;
+ for(int i=1;i<path.size()-1;i++) {
+ outputBuffer.add(new BufferedOutputToken(null, path.get(i), lastNode, lastNode+1));
+ lastNode++;
+ }
+ outputBuffer.add(new BufferedOutputToken(null, path.get(path.size()-1), lastNode, endNode));
+ }
+ }
+
+ if (keepOrig && matchInputLength > 1) {
+ // Do full "side path" with the original tokens:
+ int lastNode = outputBuffer.get(paths.size()).endNode;
+ for(int i=1;i<matchInputLength-1;i++) {
+ BufferedInputToken token = lookahead.get(lookaheadNextRead + i);
+ outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), lastNode, lastNode+1));
+ lastNode++;
+ }
+ BufferedInputToken token = lookahead.get(lookaheadNextRead + matchInputLength - 1);
+ outputBuffer.add(new BufferedOutputToken(token.state, token.term.toString(), lastNode, endNode));
+ }
+ }
+
+ /** Buffers the current input token into the lookahead buffer. */
+ private void capture() {
+ assert liveToken;
+ liveToken = false;
+ BufferedInputToken token = lookahead.get(lookaheadNextWrite);
+ lookaheadNextWrite++;
+
+ token.state = captureState();
+ token.startOffset = offsetAtt.startOffset();
+ token.endOffset = offsetAtt.endOffset();
+ assert token.term.length() == 0;
+ token.term.append(termAtt);
+
+ captureCount++;
+ maxLookaheadUsed = Math.max(maxLookaheadUsed, lookahead.getBufferSize());
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ lookahead.reset();
+ lookaheadNextWrite = 0;
+ lookaheadNextRead = 0;
+ captureCount = 0;
+ lastNodeOut = -1;
+ nextNodeOut = 0;
+ matchStartOffset = -1;
+ matchEndOffset = -1;
+ finished = false;
+ liveToken = false;
+ outputBuffer.clear();
+ maxLookaheadUsed = 0;
+ }
+
+ /** Swaps in a freshly reloaded {@link SynonymMap} (hot reload). */
+ public void update(SynonymMap synonymMap) {
+ this.synonyms = synonymMap;
+ this.fst = synonymMap.fst;
+ if (this.fst == null) {
+ throw new IllegalArgumentException("fst must be non-null");
+ }
+ this.fstReader = fst.getBytesReader();
+ }
+}
diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java
new file mode 100644
--- /dev/null
+++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymGraphTokenFilterFactory.java
+package com.bellszhu.elasticsearch.plugin.synonym.analysis;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.synonym.SynonymMap;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.IndexSettings;
+import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.AnalysisRegistry;
+import org.elasticsearch.index.analysis.TokenizerFactory;
+import org.elasticsearch.indices.analysis.AnalysisModule;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.WeakHashMap;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Graph-aware variant of {@link DynamicSynonymTokenFilterFactory}.
+ */
+public class DynamicSynonymGraphTokenFilterFactory extends
+ AbstractTokenFilterFactory {
+
+ private final Logger logger = LogManager.getLogger("dynamic-synonym");
+
+ /**
+ * Static id generator
+ */
+ private static final AtomicInteger id = new AtomicInteger(1);
+
+ private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1, new ThreadFactory() {
+ @Override
+ public Thread newThread(Runnable r) {
+ Thread thread = new Thread(r);
+ thread.setName("monitor-synonym-graph-" + id.getAndAdd(1));
+ return thread;
+ }
+ });
+
+ private volatile ScheduledFuture<?> scheduledFuture;
+
+ private final String location;
+ private final boolean ignoreCase;
+ private final boolean expand;
+ private final String format;
+ private final int interval;
+
+ private SynonymMap synonymMap;
+ private Map<DynamicSynonymGraphFilter, Integer> dynamicSynonymFilters = new WeakHashMap<>();
+ private static ConcurrentMap<String, DynamicSynonymGraphTokenFilterFactory> instanceMap = new ConcurrentHashMap<>();
+
+ public static synchronized DynamicSynonymGraphTokenFilterFactory getInstance(
+ IndexSettings indexSettings,
+ Environment env,
+ String name,
+ Settings settings,
+ AnalysisRegistry analysisRegistry
+ ) throws IOException {
+
+ String location = settings.get("synonyms_path");
+ if (location == null) {
+ throw new IllegalArgumentException(
+ "dynamic synonym requires `synonyms_path` to be configured");
+ }
+
+ DynamicSynonymGraphTokenFilterFactory instance = instanceMap.get(location);
+ if (instance != null) {
+ return instance;
+ }
+ instance = new DynamicSynonymGraphTokenFilterFactory(
+ indexSettings,
+ env,
+ name,
+ settings,
+ analysisRegistry
+ );
+ instanceMap.put(location, instance);
+ return instance;
+ }
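+
+ // NOTE: instances are keyed only by synonyms_path, so the first index to
+ // request a location fixes the settings (tokenizer, expand, interval, ...)
+ // that every later index sharing that location gets.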
+
+ public DynamicSynonymGraphTokenFilterFactory(
+ IndexSettings indexSettings,
+ Environment env,
+ String name,
+ Settings settings,
+ AnalysisRegistry analysisRegistry
+ ) throws IOException {
+
+ super(indexSettings, name, settings);
+
+
+ this.location = settings.get("synonyms_path");
+ if (this.location == null) {
+ throw new IllegalArgumentException(
+ "dynamic synonym requires `synonyms_path` to be configured");
+ }
+
+ this.interval = settings.getAsInt("interval", 60);
+ this.ignoreCase = settings.getAsBoolean("ignore_case", false);
+ this.expand = settings.getAsBoolean("expand", true);
+ this.format = settings.get("format", "");
+
+ String tokenizerName = settings.get("tokenizer", "whitespace");
+
+
+ AnalysisModule.AnalysisProvider<TokenizerFactory> tokenizerFactoryFactory =
+ analysisRegistry.getTokenizerProvider(tokenizerName, indexSettings);
+ if (tokenizerFactoryFactory == null) {
+ throw new IllegalArgumentException("failed to find tokenizer [" + tokenizerName + "] for synonym token filter");
+ }
+ final TokenizerFactory tokenizerFactory = tokenizerFactoryFactory.get(indexSettings, env, tokenizerName,
+ AnalysisRegistry.getSettingsFromIndexSettings(indexSettings, AnalysisRegistry.INDEX_ANALYSIS_TOKENIZER + "." + tokenizerName));
+
+ Analyzer analyzer = new Analyzer() {
+ @Override
+ protected TokenStreamComponents createComponents(String fieldName) {
+ Tokenizer tokenizer = tokenizerFactory == null ? new WhitespaceTokenizer() : tokenizerFactory.create();
+ TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
+ return new TokenStreamComponents(tokenizer, stream);
+ }
+ };
+
+ SynonymFile synonymFile;
+ if (location.startsWith("http://") || location.startsWith("https://")) {
+ synonymFile = new RemoteSynonymFile(env, analyzer, expand, format,
+ location);
+ } else {
+ synonymFile = new LocalSynonymFile(env, analyzer, expand, format,
+ location);
+ }
+ synonymMap = synonymFile.reloadSynonymMap();
+
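+ // Re-check the synonym source every `interval` seconds (default 60).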
+ scheduledFuture = pool.scheduleAtFixedRate(new Monitor(synonymFile),
+ interval, interval, TimeUnit.SECONDS);
+
+ }
+
+
+
+ @Override
+ public TokenStream create(TokenStream tokenStream) {
+ DynamicSynonymGraphFilter dynamicSynonymFilter = new DynamicSynonymGraphFilter(
+ tokenStream, synonymMap, ignoreCase);
+ dynamicSynonymFilters.put(dynamicSynonymFilter, 1);
+
+ // fst is null means no synonyms
+ return synonymMap.fst == null ? tokenStream : dynamicSynonymFilter;
+ }
+
+ public class Monitor implements Runnable {
+
+ private SynonymFile synonymFile;
+
+ public Monitor(SynonymFile synonymFile) {
+ this.synonymFile = synonymFile;
+ }
+
+ @Override
+ public void run() {
+ if (synonymFile.isNeedReloadSynonymMap()) {
+ synonymMap = synonymFile.reloadSynonymMap();
+ for (DynamicSynonymGraphFilter dynamicSynonymFilter : dynamicSynonymFilters
+ .keySet()) {
+ dynamicSynonymFilter.update(synonymMap);
+ logger.info("success reload synonym");
+ }
+ }
+ }
+ }
+
+}
\ No newline at end of file
diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java
index a490580..8a7b376 100644
--- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java
+++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/DynamicSynonymTokenFilterFactory.java
@@ -1,25 +1,14 @@
package com.bellszhu.elasticsearch.plugin.synonym.analysis;
-import java.util.Map;
-import java.util.WeakHashMap;
-import java.util.concurrent.Executors;
-import java.util.concurrent.ScheduledExecutorService;
-import java.util.concurrent.ScheduledFuture;
-import java.util.concurrent.ThreadFactory;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
-
-
+import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymMap;
-import org.elasticsearch.common.logging.ESLoggerFactory;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
@@ -29,6 +18,10 @@
import org.elasticsearch.indices.analysis.AnalysisModule;
import java.io.IOException;
+import java.util.Map;
+import java.util.WeakHashMap;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.AtomicInteger;
/**
*
@@ -38,8 +31,7 @@
public class DynamicSynonymTokenFilterFactory extends
AbstractTokenFilterFactory {
- public static Logger logger = ESLoggerFactory.getLogger("dynamic-synonym");
-
+ private final Logger logger = LogManager.getLogger("dynamic-synonym");
/**
* Static id generator
*/
@@ -61,9 +53,40 @@ public Thread newThread(Runnable r) {
private final boolean expand;
private final String format;
private final int interval;
+ private final boolean ignoreOffset;
private SynonymMap synonymMap;
private Map<DynamicSynonymFilter, Integer> dynamicSynonymFilters = new WeakHashMap<>();
+ private static ConcurrentMap<String, DynamicSynonymTokenFilterFactory> instanceMap = new ConcurrentHashMap<>();
+
+ public static synchronized DynamicSynonymTokenFilterFactory getInstance(
+ IndexSettings indexSettings,
+ Environment env,
+ String name,
+ Settings settings,
+ AnalysisRegistry analysisRegistry
+ ) throws IOException {
+
+ String location = settings.get("synonyms_path");
+ if (location == null) {
+ throw new IllegalArgumentException(
+ "dynamic synonym requires `synonyms_path` to be configured");
+ }
+
+ DynamicSynonymTokenFilterFactory instance = instanceMap.get(location);
+ if (instance != null) {
+ return instance;
+ }
+ instance = new DynamicSynonymTokenFilterFactory(
+ indexSettings,
+ env,
+ name,
+ settings,
+ analysisRegistry
+ );
+ instanceMap.put(location, instance);
+ return instance;
+ }
public DynamicSynonymTokenFilterFactory(
IndexSettings indexSettings,
@@ -86,6 +109,7 @@ public DynamicSynonymTokenFilterFactory(
this.ignoreCase = settings.getAsBoolean("ignore_case", false);
this.expand = settings.getAsBoolean("expand", true);
this.format = settings.get("format", "");
+ this.ignoreOffset = settings.getAsBoolean("ignore_offset", true);
String tokenizerName = settings.get("tokenizer", "whitespace");
@@ -131,7 +155,7 @@ protected TokenStreamComponents createComponents(String fieldName) {
@Override
public TokenStream create(TokenStream tokenStream) {
DynamicSynonymFilter dynamicSynonymFilter = new DynamicSynonymFilter(
- tokenStream, synonymMap, ignoreCase);
+ tokenStream, synonymMap, ignoreCase, ignoreOffset);
dynamicSynonymFilters.put(dynamicSynonymFilter, 1);
// fst is null means no synonyms
diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java
index eb98846..c659043 100644
--- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java
+++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/LocalSynonymFile.java
@@ -3,22 +3,18 @@
*/
package com.bellszhu.elasticsearch.plugin.synonym.analysis;
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.IOException;
-import java.io.Reader;
-import java.nio.file.Path;
-
import org.apache.commons.codec.Charsets;
+import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
-import org.elasticsearch.common.io.FileSystemUtils;
-import org.elasticsearch.common.logging.ESLoggerFactory;
import org.elasticsearch.env.Environment;
+import java.io.*;
+import java.nio.file.Path;
+
/**
* @author bellszhu
@@ -26,7 +22,7 @@
*/
public class LocalSynonymFile implements SynonymFile {
- public static Logger logger = ESLoggerFactory.getLogger("dynamic-synonym");
+ private final Logger logger = LogManager.getLogger("dynamic-synonym");
private String format;
@@ -82,19 +78,14 @@ public Reader getReader() {
Reader reader = null;
BufferedReader br = null;
try {
- reader = FileSystemUtils.newBufferedReader(
- synonymFilePath.toUri().toURL(), Charsets.UTF_8);
- /*
- br = new BufferedReader(new InputStreamReader(
- synonymFileURL.openStream(), Charsets.UTF_8));
- StringBuffer sb = new StringBuffer("");
- String line = null;
+ br = new BufferedReader(new InputStreamReader(synonymFilePath.toUri().toURL().openStream(), Charsets.UTF_8));
+ StringBuilder sb = new StringBuilder();
+ String line;
while ((line = br.readLine()) != null) {
logger.info("reload local synonym: {}", line);
sb.append(line).append(System.getProperty("line.separator"));
}
- reader = new FastStringReader(sb.toString());
- */
+ reader = new StringReader(sb.toString());
} catch (IOException e) {
logger.error("get local synonym reader {} error!", e, location);
throw new IllegalArgumentException(
diff --git a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java
index 4cb402a..05cc6dd 100644
--- a/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java
+++ b/src/main/java/com/bellszhu/elasticsearch/plugin/synonym/analysis/RemoteSynonymFile.java
@@ -9,18 +9,21 @@
import org.apache.http.client.methods.HttpHead;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
+import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.synonym.WordnetSynonymParser;
-import org.elasticsearch.common.logging.ESLoggerFactory;
+import org.elasticsearch.SpecialPermission;
import org.elasticsearch.env.Environment;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
+import java.security.AccessController;
+import java.security.PrivilegedAction;
/**
* @author bellszhu
@@ -28,7 +31,7 @@
*/
public class RemoteSynonymFile implements SynonymFile {
- public static Logger logger = ESLoggerFactory.getLogger("dynamic-synonym");
+ private final Logger logger = LogManager.getLogger("dynamic-synonym");
private CloseableHttpClient httpclient = HttpClients.createDefault();
@@ -61,7 +64,14 @@ public RemoteSynonymFile(Environment env, Analyzer analyzer,
}
@Override
- public SynonymMap reloadSynonymMap() {
+ public SynonymMap reloadSynonymMap() {
+ SpecialPermission.check();
+ return AccessController.doPrivileged((PrivilegedAction<SynonymMap>) () -> {
+ return reloadSynonymMapUnprivileged();
+ });
+ }
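+
+ // SpecialPermission.check() plus doPrivileged pairs with the SocketPermission
+ // granted in plugin-security.policy (added below): under the Elasticsearch
+ // security manager, the remote fetch must run as this plugin's own codebase.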
+
+ public SynonymMap reloadSynonymMapUnprivileged() {
Reader rulesReader = null;
try {
logger.info("start reload remote synonym from {}.", location);
@@ -93,10 +103,18 @@ public SynonymMap reloadSynonymMap() {
}
}
+ @Override
+ public Reader getReader() {
+ SpecialPermission.check();
+ return AccessController.doPrivileged((PrivilegedAction<Reader>) () -> {
+ return getReaderUnprivileged();
+ });
+ }
+
/**
* Downloads custom synonym entries from the remote server
*/
- public Reader getReader() {
+ public Reader getReaderUnprivileged() {
Reader reader = null;
RequestConfig rc = RequestConfig.custom()
.setConnectionRequestTimeout(10 * 1000)
@@ -155,6 +173,13 @@ public Reader getReader() {
@Override
public boolean isNeedReloadSynonymMap() {
+ SpecialPermission.check();
+ return AccessController.doPrivileged((PrivilegedAction<Boolean>) () -> {
+ return isNeedReloadSynonymMapUnprivileged();
+ });
+ }
+
+ public boolean isNeedReloadSynonymMapUnprivileged() {
RequestConfig rc = RequestConfig.custom()
.setConnectionRequestTimeout(10 * 1000)
.setConnectTimeout(10 * 1000).setSocketTimeout(15 * 1000)
diff --git a/src/main/resources/plugin-descriptor.properties b/src/main/resources/plugin-descriptor.properties
index 29b3c64..58226c3 100644
--- a/src/main/resources/plugin-descriptor.properties
+++ b/src/main/resources/plugin-descriptor.properties
@@ -43,7 +43,7 @@ name=${elasticsearch.plugin.name}
#
# 'site': set to true to indicate contents of the _site/
# directory in the root of the plugin should be served.
-site=${elasticsearch.plugin.site}
+#site=${elasticsearch.plugin.site}
#
### mandatory elements for jvm plugins :
#
@@ -52,7 +52,7 @@ site=${elasticsearch.plugin.site}
# Note that only jar files in the root directory are
# added to the classpath for the plugin! If you need
# other resources, package them into a resources jar.
-jvm=${elasticsearch.plugin.jvm}
+#jvm=${elasticsearch.plugin.jvm}
#
# 'classname': the name of the class to load, fully-qualified.
classname=${elasticsearch.plugin.classname}
@@ -76,5 +76,5 @@ elasticsearch.version=${elasticsearch.version}
# passing false is deprecated, and only intended to support plugins
# that have hard dependencies against each other. If this is
# not specified, then the plugin is isolated by default.
-isolated=${elasticsearch.plugin.isolated}
+#isolated=${elasticsearch.plugin.isolated}
#
\ No newline at end of file
diff --git a/src/main/resources/plugin-security.policy b/src/main/resources/plugin-security.policy
new file mode 100644
index 0000000..55d759a
--- /dev/null
+++ b/src/main/resources/plugin-security.policy
@@ -0,0 +1,4 @@
+grant {
+ // needed because of the hot reload functionality
+ permission java.net.SocketPermission "*", "connect,resolve";
+};
\ No newline at end of file