ports/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__Snowball.cpp
Jung-uk Kim fa023c7e1a

- Implement efficient BitSet::nextSetBit() to reduce diff against upstream.

http://clucene.git.sourceforge.net/git/gitweb.cgi?p=clucene/clucene;a=commitdiff;h=17e53d7
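
For illustration only, a word-at-a-time scan in the spirit of that change; this
is a sketch, not the committed CLucene code, and the words/numBits
representation and the GCC/Clang __builtin_ctzll intrinsic are assumptions.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Sketch: index of the first set bit at or after fromIndex, or -1 if none.
    // Whole zero words are skipped instead of testing every bit individually.
    int64_t nextSetBit(const std::vector<uint64_t>& words, size_t numBits,
                       size_t fromIndex) {
        if (fromIndex >= numBits)
            return -1;
        size_t w = fromIndex / 64;
        // Clear the bits below fromIndex in the first word, then scan forward.
        uint64_t word = words[w] & (~UINT64_C(0) << (fromIndex % 64));
        while (true) {
            if (word != 0) {
                size_t bit = w * 64 + __builtin_ctzll(word);
                return bit < numBits ? (int64_t)bit : -1;
            }
            if (++w == words.size())
                return -1;
            word = words[w];
        }
    }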

- Fix a buffer overflow in CJKAnalyzer.  Somehow upstream missed this in
  the 2.3.3.4 branch.

http://clucene.svn.sourceforge.net/viewvc/clucene?view=revision&revision=2630

- Fix potential memory leaks in libstemmer.  Merged from Snowball changes.

http://svn.tartarus.org/snowball/trunk/snowball/libstemmer/libstemmer_c.in?r1=409&r2=520&view=patch
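
The referenced Snowball diff is the authoritative change; purely as a
hypothetical illustration of this class of leak (none of these names are from
libstemmer), an object that is only partially constructed has to be released
on every error path:

    #include <cstdlib>

    struct stemmer_ctx { void* env; };   /* hypothetical stand-in type */

    stemmer_ctx* ctx_new() {
        stemmer_ctx* ctx = (stemmer_ctx*)std::malloc(sizeof(stemmer_ctx));
        if (ctx == NULL)
            return NULL;
        ctx->env = std::malloc(1024);    /* stand-in for the language env */
        if (ctx->env == NULL) {
            std::free(ctx);              /* without this, ctx leaks */
            return NULL;
        }
        return ctx;
    }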

- Implement SnowballAnalyzer::reusableTokenStream(). [1]  This patch also
  fixes memory leaks found by the submitter.  (See the usage sketch below.)

Submitted by:	Kishore Ramareddy (kishore at niksun dot com)
		(initial version) [1]
Feature safe:	yes
2013-04-16 18:37:03 +00:00
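
For context on [1], a minimal usage sketch of the reusableTokenStream() entry
point added by the patch below.  The header paths, StringReader constructor,
and _T() literals are assumptions about the CLucene 2.3.3.4 contribs API; the
method signature itself is taken from the diff.

    #include "CLucene.h"
    #include "CLucene/snowball/SnowballAnalyzer.h"

    using namespace lucene::analysis;

    // Tokenize several field values with one analyzer.  On the first call the
    // analyzer builds its StandardTokenizer + filter chain and caches it in a
    // SavedStreams object; later calls only reset() the cached tokenizer
    // instead of rebuilding the whole chain per document.
    void tokenizeAll(const TCHAR* const* texts, size_t count) {
        snowball::SnowballAnalyzer analyzer(_T("english"));
        Token token;
        for (size_t i = 0; i < count; ++i) {
            lucene::util::StringReader reader(texts[i]);
            TokenStream* ts = analyzer.reusableTokenStream(_T("contents"), &reader);
            while (ts->next(&token) != NULL) {
                /* feed the token to the indexer */
            }
            /* do not _CLDELETE(ts): the analyzer owns the cached stream */
        }
    }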


--- src/contribs-lib/CLucene/snowball/Snowball.cpp.orig 2011-03-16 20:21:07.000000000 -0400
+++ src/contribs-lib/CLucene/snowball/Snowball.cpp 2013-04-01 19:14:15.000000000 -0400
@@ -19,16 +19,31 @@
 CL_NS_DEF2(analysis,snowball)
+  class SnowballAnalyzer::SavedStreams : public TokenStream {
+  public:
+    StandardTokenizer* tokenStream;
+    TokenStream* filteredTokenStream;
+
+    SavedStreams():tokenStream(NULL), filteredTokenStream(NULL) {}
+    void close(){}
+    Token* next(Token* token) {return NULL;}
+  };
+
   /** Builds the named analyzer with no stop words. */
   SnowballAnalyzer::SnowballAnalyzer(const TCHAR* language) {
     this->language = STRDUP_TtoT(language);
     stopSet = NULL;
   }
-  SnowballAnalyzer::~SnowballAnalyzer(){
-    _CLDELETE_CARRAY(language);
-    if ( stopSet != NULL )
-      _CLDELETE(stopSet);
+  SnowballAnalyzer::~SnowballAnalyzer() {
+    SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
+    if (streams != NULL) {
+      _CLDELETE(streams->filteredTokenStream);
+      _CLDELETE(streams);
+    }
+    _CLDELETE_CARRAY(language);
+    if (stopSet != NULL)
+      _CLDELETE(stopSet);
   }
/** Builds the named analyzer with the given stop words.
@@ -62,12 +77,29 @@
     result = _CLNEW SnowballFilter(result, language, true);
     return result;
   }
-
-
-
-
-
-
+
+  TokenStream* SnowballAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
+    SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
+
+    if (streams == NULL) {
+      streams = _CLNEW SavedStreams();
+      BufferedReader* bufferedReader = reader->__asBufferedReader();
+
+      if (bufferedReader == NULL)
+        streams->tokenStream = _CLNEW StandardTokenizer(_CLNEW FilteredBufferedReader(reader, false), true);
+      else
+        streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader);
+
+      streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true);
+      streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true);
+      if (stopSet != NULL)
+        streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet);
+      streams->filteredTokenStream = _CLNEW SnowballFilter(streams->filteredTokenStream, language, true);
+      setPreviousTokenStream(streams);
+    } else
+      streams->tokenStream->reset(reader);
+    return streams->filteredTokenStream;
+  }
   /** Construct the named stemming filter.
    *