mirror of
https://git.freebsd.org/ports.git
synced 2025-04-28 09:36:41 -04:00
textproc/amberfish: update to 1.7.1, take maintainership
- chase to new upstream
- always install man pages as per policy
- license changed to MIT
- submitter becomes maintainer
- turn static REINPLACE_CMD use into patches

Changelog: https://gitlab.com/amberfish/amberfish/-/releases
PR: 282880
This commit is contained in:
parent
63202843ff
commit
13e3d18eb6
6 changed files with 72 additions and 494 deletions
|
@ -1,54 +1,46 @@
|
||||||
PORTNAME= amberfish
|
PORTNAME= amberfish
|
||||||
PORTVERSION= 1.6.4
|
DISTVERSIONPREFIX= v
|
||||||
PORTREVISION= 3
|
DISTVERSION= 1.7.1
|
||||||
CATEGORIES= textproc databases
|
CATEGORIES= textproc databases
|
||||||
MASTER_SITES= SF/${PORTNAME}/Amberfish%20source%20-%20stable/${PORTVERSION} \
|
|
||||||
http://etymon.com/software/amberfish/stable/
|
|
||||||
|
|
||||||
MAINTAINER= ports@FreeBSD.org
|
MAINTAINER= nrn@etymon.com
|
||||||
COMMENT= General purpose text retrieval Software
|
COMMENT= Full-text search engine with command-line interface
|
||||||
WWW= https://web.archive.org/web/20100419215307/http://www.etymon.com/tr.html
|
WWW= https://gitlab.com/amberfish/amberfish
|
||||||
|
|
||||||
LICENSE= GPLv2
|
LICENSE= MIT
|
||||||
LICENSE_FILE= ${WRKSRC}/COPYING
|
LICENSE_FILE= ${WRKSRC}/LICENSE
|
||||||
|
|
||||||
LIB_DEPENDS= libxerces-c.so:textproc/xerces-c3
|
LIB_DEPENDS= libxerces-c.so:textproc/xerces-c3
|
||||||
|
|
||||||
USES= gmake
|
USES= gmake
|
||||||
|
USE_GITLAB= yes
|
||||||
|
|
||||||
GNU_CONFIGURE= yes
|
GNU_CONFIGURE= yes
|
||||||
GNU_CONFIGURE_MANPREFIX=${PREFIX}/share
|
ALL_TARGET= all
|
||||||
|
|
||||||
ALL_TARGET= all html
|
PLIST_FILES= bin/af \
|
||||||
|
share/man/man1/af.1.gz \
|
||||||
PLIST_FILES= bin/af
|
share/man/man3/afclose.3.gz \
|
||||||
PORTDOCS= *
|
share/man/man3/afgetresultmd.3.gz \
|
||||||
|
share/man/man3/afopen.3.gz \
|
||||||
|
share/man/man3/afsearch.3.gz \
|
||||||
|
share/man/man3/afsortdocid.3.gz \
|
||||||
|
share/man/man3/afsortscore.3.gz
|
||||||
|
PORTDOCS= amberfish.html
|
||||||
|
|
||||||
OPTIONS_DEFINE= DOCS
|
OPTIONS_DEFINE= DOCS
|
||||||
|
DOCS_BUILD_DEPENDS= asciidoctor:textproc/rubygem-asciidoctor
|
||||||
|
DOCS_ALL_TARGET= html
|
||||||
|
|
||||||
DOCS_USES= makeinfo
|
post-configure:
|
||||||
DOCS_PLIST_FILES= share/man/man1/af.1.gz
|
${ECHO_CMD} "#define AF_VERSION \"v${DISTVERSION}\"" > ${WRKSRC}/src/backend/version.h
|
||||||
|
${ECHO_CMD} v${DISTVERSION} > ${WRKSRC}/doc/version.adoc
|
||||||
|
|
||||||
post-extract:
|
post-install:
|
||||||
${CP} ${FILESDIR}/porter.cc ${WRKSRC}/src
|
${STRIP_CMD} ${STAGEDIR}${PREFIX}/bin/af
|
||||||
|
|
||||||
post-patch:
|
|
||||||
@${REINPLACE_CMD} -e \
|
|
||||||
's|$${MAKEFLAGS} ||' ${WRKSRC}/Makefile
|
|
||||||
@${REINPLACE_CMD} -e \
|
|
||||||
's|cp |$${BSD_INSTALL_MAN} |' ${WRKSRC}/doc/Makefile.in
|
|
||||||
@${REINPLACE_CMD} -e \
|
|
||||||
's|-O3 |@CFLAGS@ | ; \
|
|
||||||
s|make strip|| ; \
|
|
||||||
s|cp |$${BSD_INSTALL_PROGRAM} |' ${WRKSRC}/src/Makefile.in
|
|
||||||
|
|
||||||
post-patch-DOCS-off:
|
|
||||||
@${REINPLACE_CMD} -e \
|
|
||||||
'/cd doc/d' ${WRKSRC}/Makefile
|
|
||||||
|
|
||||||
post-install-DOCS-on:
|
post-install-DOCS-on:
|
||||||
@${MKDIR} ${STAGEDIR}${DOCSDIR}
|
${MKDIR} ${STAGEDIR}${DOCSDIR}
|
||||||
${INSTALL_DATA} ${WRKSRC}/amberfish.png ${STAGEDIR}${DOCSDIR}
|
${INSTALL_DATA} ${WRKSRC}/doc/amberfish.html ${STAGEDIR}${DOCSDIR}
|
||||||
${INSTALL_DATA} ${WRKSRC}/doc/html/*.html ${STAGEDIR}${DOCSDIR}
|
|
||||||
|
|
||||||
.include <bsd.port.mk>
|
.include <bsd.port.mk>
|
||||||
|
|
|
@ -1,2 +1,3 @@
|
||||||
SHA256 (amberfish-1.6.4.tar.gz) = 155ac6e6b9b76fb7cbd94952548f718ab6add72c3b4fd2482d89abb39d96ce76
|
TIMESTAMP = 1732616395
|
||||||
SIZE (amberfish-1.6.4.tar.gz) = 127198
|
SHA256 (amberfish-v1.7.1.tar.bz2) = 67c8b007be4652ceaafe0d93c9ac6ef40541e6163f820f8271d02704817af9a0
|
||||||
|
SIZE (amberfish-v1.7.1.tar.bz2) = 117241
|
||||||
|
|
26
textproc/amberfish/files/patch-Makefile
Normal file
26
textproc/amberfish/files/patch-Makefile
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
--- Makefile.orig 2024-11-23 13:45:47 UTC
|
||||||
|
+++ Makefile
|
||||||
|
@@ -16,18 +16,18 @@ strip:
|
||||||
|
cd doc ; ${MAKE} html
|
||||||
|
|
||||||
|
strip:
|
||||||
|
- cd src/backend ; ${MAKE} ${MAKEFLAGS} strip
|
||||||
|
+ cd src/backend ; ${MAKE} strip
|
||||||
|
# cd src/interface ; ${MAKE} ${MAKEFLAGS} strip
|
||||||
|
|
||||||
|
install:
|
||||||
|
- cd src/backend ; ${MAKE} ${MAKEFLAGS} install
|
||||||
|
+ cd src/backend ; ${MAKE} install
|
||||||
|
# cd src/interface ; ${MAKE} ${MAKEFLAGS} install
|
||||||
|
- cd doc ; ${MAKE} ${MAKEFLAGS} install
|
||||||
|
+ cd doc ; ${MAKE} install
|
||||||
|
|
||||||
|
uninstall:
|
||||||
|
- cd src/backend ; ${MAKE} ${MAKEFLAGS} uninstall
|
||||||
|
+ cd src/backend ; ${MAKE} uninstall
|
||||||
|
# cd src/interface ; ${MAKE} ${MAKEFLAGS} uninstall
|
||||||
|
- cd doc ; ${MAKE} ${MAKEFLAGS} uninstall
|
||||||
|
+ cd doc ; ${MAKE} uninstall
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -fr autom4te.cache
|
10
textproc/amberfish/files/patch-src_backend_Makefile.in
Normal file
10
textproc/amberfish/files/patch-src_backend_Makefile.in
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
--- src/backend/Makefile.in.orig 2024-11-23 13:47:04 UTC
|
||||||
|
+++ src/backend/Makefile.in
|
||||||
|
@@ -62,7 +62,6 @@ install: all
|
||||||
|
strip ${AF}
|
||||||
|
|
||||||
|
install: all
|
||||||
|
- make strip
|
||||||
|
mkdir -p ${PREFIXBIN}
|
||||||
|
cp ${BIN} ${PREFIXBIN}/.
|
||||||
|
|
|
@ -1,438 +0,0 @@
|
||||||
|
|
||||||
/* This is the Porter stemming algorithm, coded up in ANSI C by the
|
|
||||||
author. It may be regarded as canonical, in that it follows the
|
|
||||||
algorithm presented in
|
|
||||||
|
|
||||||
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
|
||||||
no. 3, pp 130-137,
|
|
||||||
|
|
||||||
only differing from it at the points marked --DEPARTURE-- below.
|
|
||||||
|
|
||||||
See also http://www.tartarus.org/~martin/PorterStemmer
|
|
||||||
|
|
||||||
The algorithm as described in the paper could be exactly replicated
|
|
||||||
by adjusting the points of DEPARTURE, but this is barely necessary,
|
|
||||||
because (a) the points of DEPARTURE are definitely improvements, and
|
|
||||||
(b) no encoding of the Porter stemmer I have seen is anything like
|
|
||||||
as exact as this version, even with the points of DEPARTURE!
|
|
||||||
|
|
||||||
You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
|
|
||||||
'stem' takes a list of inputs and sends the stemmed equivalent to
|
|
||||||
stdout.
|
|
||||||
|
|
||||||
The algorithm as encoded here is particularly fast.
|
|
||||||
|
|
||||||
Release 1
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <string.h> /* for memmove */
|
|
||||||
|
|
||||||
#define TRUE 1
|
|
||||||
#define FALSE 0
|
|
||||||
|
|
||||||
/* The main part of the stemming algorithm starts here. b is a buffer
|
|
||||||
holding a word to be stemmed. The letters are in b[k0], b[k0+1] ...
|
|
||||||
ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted
|
|
||||||
downwards as the stemming progresses. Zero termination is not in fact
|
|
||||||
used in the algorithm.
|
|
||||||
|
|
||||||
Note that only lower case sequences are stemmed. Forcing to lower case
|
|
||||||
should be done before stem(...) is called.
|
|
||||||
*/
|
|
||||||
|
|
||||||
static char * b; /* buffer for word to be stemmed */
|
|
||||||
static int k,k0,j; /* j is a general offset into the string */
|
|
||||||
|
|
||||||
/* cons(i) is TRUE <=> b[i] is a consonant. */
|
|
||||||
|
|
||||||
static int cons(int i)
|
|
||||||
{
|
|
||||||
switch (b[i])
|
|
||||||
{
|
|
||||||
case 'a': case 'e': case 'i': case 'o': case 'u': return FALSE;
|
|
||||||
case 'y': return (i==k0) ? TRUE : !cons(i-1);
|
|
||||||
default: return TRUE;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* m() measures the number of consonant sequences between k0 and j. if c is
|
|
||||||
a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
|
|
||||||
presence,
|
|
||||||
|
|
||||||
<c><v> gives 0
|
|
||||||
<c>vc<v> gives 1
|
|
||||||
<c>vcvc<v> gives 2
|
|
||||||
<c>vcvcvc<v> gives 3
|
|
||||||
....
|
|
||||||
*/
|
|
||||||
|
|
||||||
static int m()
|
|
||||||
{
|
|
||||||
int n = 0;
|
|
||||||
int i = k0;
|
|
||||||
while(TRUE)
|
|
||||||
{
|
|
||||||
if (i > j) return n;
|
|
||||||
if (! cons(i)) break; i++;
|
|
||||||
}
|
|
||||||
i++;
|
|
||||||
while(TRUE)
|
|
||||||
{
|
|
||||||
while(TRUE)
|
|
||||||
{
|
|
||||||
if (i > j) return n;
|
|
||||||
if (cons(i)) break;
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
i++;
|
|
||||||
n++;
|
|
||||||
while(TRUE)
|
|
||||||
{
|
|
||||||
if (i > j) return n;
|
|
||||||
if (! cons(i)) break;
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* vowelinstem() is TRUE <=> k0,...j contains a vowel */
|
|
||||||
|
|
||||||
static int vowelinstem()
|
|
||||||
{
|
|
||||||
int i; for (i = k0; i <= j; i++) if (! cons(i)) return TRUE;
|
|
||||||
return FALSE;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* doublec(j) is TRUE <=> j,(j-1) contain a double consonant. */
|
|
||||||
|
|
||||||
static int doublec(int j)
|
|
||||||
{
|
|
||||||
if (j < k0+1) return FALSE;
|
|
||||||
if (b[j] != b[j-1]) return FALSE;
|
|
||||||
return cons(j);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
|
|
||||||
and also if the second c is not w,x or y. this is used when trying to
|
|
||||||
restore an e at the end of a short word. e.g.
|
|
||||||
|
|
||||||
cav(e), lov(e), hop(e), crim(e), but
|
|
||||||
snow, box, tray.
|
|
||||||
|
|
||||||
*/
|
|
||||||
|
|
||||||
static int cvc(int i)
|
|
||||||
{
|
|
||||||
if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2)) return FALSE;
|
|
||||||
{
|
|
||||||
int ch = b[i];
|
|
||||||
if (ch == 'w' || ch == 'x' || ch == 'y') return FALSE;
|
|
||||||
}
|
|
||||||
return TRUE;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* ends(s) is TRUE <=> k0,...k ends with the string s. */
|
|
||||||
|
|
||||||
static int ends(char * s)
|
|
||||||
{
|
|
||||||
int length = s[0];
|
|
||||||
if (s[length] != b[k]) return FALSE; /* tiny speed-up */
|
|
||||||
if (length > k-k0+1) return FALSE;
|
|
||||||
if (memcmp(b+k-length+1,s+1,length) != 0) return FALSE;
|
|
||||||
j = k-length;
|
|
||||||
return TRUE;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* setto(s) sets (j+1),...k to the characters in the string s, readjusting
|
|
||||||
k. */
|
|
||||||
|
|
||||||
static void setto(char * s)
|
|
||||||
{
|
|
||||||
int length = s[0];
|
|
||||||
memmove(b+j+1,s+1,length);
|
|
||||||
k = j+length;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* r(s) is used further down. */
|
|
||||||
|
|
||||||
static void r(char * s) { if (m() > 0) setto(s); }
|
|
||||||
|
|
||||||
/* step1ab() gets rid of plurals and -ed or -ing. e.g.
|
|
||||||
|
|
||||||
caresses -> caress
|
|
||||||
ponies -> poni
|
|
||||||
ties -> ti
|
|
||||||
caress -> caress
|
|
||||||
cats -> cat
|
|
||||||
|
|
||||||
feed -> feed
|
|
||||||
agreed -> agree
|
|
||||||
disabled -> disable
|
|
||||||
|
|
||||||
matting -> mat
|
|
||||||
mating -> mate
|
|
||||||
meeting -> meet
|
|
||||||
milling -> mill
|
|
||||||
messing -> mess
|
|
||||||
|
|
||||||
meetings -> meet
|
|
||||||
|
|
||||||
*/
|
|
||||||
|
|
||||||
static void step1ab()
|
|
||||||
{
|
|
||||||
if (b[k] == 's')
|
|
||||||
{
|
|
||||||
if (ends("\04" "sses")) k -= 2; else
|
|
||||||
if (ends("\03" "ies")) setto("\01" "i"); else
|
|
||||||
if (b[k-1] != 's') k--;
|
|
||||||
}
|
|
||||||
if (ends("\03" "eed")) { if (m() > 0) k--; }
|
|
||||||
else
|
|
||||||
if ((ends("\02" "ed") || ends("\03" "ing")) && vowelinstem())
|
|
||||||
{
|
|
||||||
k = j;
|
|
||||||
if (ends("\02" "at")) setto("\03" "ate"); else
|
|
||||||
if (ends("\02" "bl")) setto("\03" "ble"); else
|
|
||||||
if (ends("\02" "iz")) setto("\03" "ize"); else
|
|
||||||
if (doublec(k))
|
|
||||||
{
|
|
||||||
k--;
|
|
||||||
{
|
|
||||||
int ch = b[k];
|
|
||||||
if (ch == 'l' || ch == 's' || ch == 'z') k++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (m() == 1 && cvc(k)) setto("\01" "e");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* step1c() turns terminal y to i when there is another vowel in the stem. */
|
|
||||||
|
|
||||||
static void step1c() { if (ends("\01" "y") && vowelinstem()) b[k] = 'i'; }
|
|
||||||
|
|
||||||
/* step2() maps double suffixes to single ones. so -ization ( = -ize plus
|
|
||||||
-ation) maps to -ize etc. note that the string before the suffix must give
|
|
||||||
m() > 0. */
|
|
||||||
|
|
||||||
static void step2()
|
|
||||||
{
|
|
||||||
switch (b[k-1])
|
|
||||||
{
|
|
||||||
case 'a': if (ends("\07" "ational")) { r("\03" "ate"); break; }
|
|
||||||
if (ends("\06" "tional")) { r("\04" "tion"); break; }
|
|
||||||
break;
|
|
||||||
case 'c': if (ends("\04" "enci")) { r("\04" "ence"); break; }
|
|
||||||
if (ends("\04" "anci")) { r("\04" "ance"); break; }
|
|
||||||
break;
|
|
||||||
case 'e': if (ends("\04" "izer")) { r("\03" "ize"); break; }
|
|
||||||
break;
|
|
||||||
case 'l': if (ends("\03" "bli")) /*-DEPARTURE-*/
|
|
||||||
{
|
|
||||||
r("\03" "ble"); break;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* To match the published algorithm, replace this line with
|
|
||||||
case 'l': if (ends("\04" "abli")) { r("\04" "able"); break; } */
|
|
||||||
|
|
||||||
if (ends("\04" "alli")) { r("\02" "al"); break; }
|
|
||||||
if (ends("\05" "entli")) { r("\03" "ent"); break; }
|
|
||||||
if (ends("\03" "eli")) { r("\01" "e"); break; }
|
|
||||||
if (ends("\05" "ousli")) { r("\03" "ous"); break; }
|
|
||||||
break;
|
|
||||||
case 'o': if (ends("\07" "ization")) { r("\03" "ize"); break; }
|
|
||||||
if (ends("\05" "ation")) { r("\03" "ate"); break; }
|
|
||||||
if (ends("\04" "ator")) { r("\03" "ate"); break; }
|
|
||||||
break;
|
|
||||||
case 's': if (ends("\05" "alism")) { r("\02" "al"); break; }
|
|
||||||
if (ends("\07" "iveness")) { r("\03" "ive"); break; }
|
|
||||||
if (ends("\07" "fulness")) { r("\03" "ful"); break; }
|
|
||||||
if (ends("\07" "ousness")) { r("\03" "ous"); break; }
|
|
||||||
break;
|
|
||||||
case 't': if (ends("\05" "aliti")) { r("\02" "al"); break; }
|
|
||||||
if (ends("\05" "iviti")) { r("\03" "ive"); break; }
|
|
||||||
if (ends("\06" "biliti")) { r("\03" "ble"); break; }
|
|
||||||
break;
|
|
||||||
case 'g': if (ends("\04" "logi")) /*-DEPARTURE-*/
|
|
||||||
{
|
|
||||||
r("\03" "log"); break;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* To match the published algorithm, delete this line */
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* step3() deals with -ic-, -full, -ness etc. similar strategy to step2. */
|
|
||||||
|
|
||||||
static void step3()
|
|
||||||
{
|
|
||||||
switch (b[k])
|
|
||||||
{
|
|
||||||
case 'e': if (ends("\05" "icate")) { r("\02" "ic"); break; }
|
|
||||||
if (ends("\05" "ative")) { r("\00" ""); break; }
|
|
||||||
if (ends("\05" "alize")) { r("\02" "al"); break; }
|
|
||||||
break;
|
|
||||||
case 'i': if (ends("\05" "iciti")) { r("\02" "ic"); break; }
|
|
||||||
break;
|
|
||||||
case 'l': if (ends("\04" "ical")) { r("\02" "ic"); break; }
|
|
||||||
if (ends("\03" "ful")) { r("\00" ""); break; }
|
|
||||||
break;
|
|
||||||
case 's': if (ends("\04" "ness")) { r("\00" ""); break; }
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* step4() takes off -ant, -ence etc., in context <c>vcvc<v>. */
|
|
||||||
|
|
||||||
static void step4()
|
|
||||||
{
|
|
||||||
switch (b[k-1])
|
|
||||||
{
|
|
||||||
case 'a': if (ends("\02" "al")) break; return;
|
|
||||||
case 'c': if (ends("\04" "ance")) break;
|
|
||||||
if (ends("\04" "ence")) break; return;
|
|
||||||
case 'e': if (ends("\02" "er")) break; return;
|
|
||||||
case 'i': if (ends("\02" "ic")) break; return;
|
|
||||||
case 'l': if (ends("\04" "able")) break;
|
|
||||||
if (ends("\04" "ible")) break; return;
|
|
||||||
case 'n': if (ends("\03" "ant")) break;
|
|
||||||
if (ends("\05" "ement")) break;
|
|
||||||
if (ends("\04" "ment")) break;
|
|
||||||
if (ends("\03" "ent")) break; return;
|
|
||||||
case 'o': if (ends("\03" "ion") && (b[j] == 's' || b[j] == 't')) break;
|
|
||||||
if (ends("\02" "ou")) break; return;
|
|
||||||
/* takes care of -ous */
|
|
||||||
case 's': if (ends("\03" "ism")) break; return;
|
|
||||||
case 't': if (ends("\03" "ate")) break;
|
|
||||||
if (ends("\03" "iti")) break; return;
|
|
||||||
case 'u': if (ends("\03" "ous")) break; return;
|
|
||||||
case 'v': if (ends("\03" "ive")) break; return;
|
|
||||||
case 'z': if (ends("\03" "ize")) break; return;
|
|
||||||
default: return;
|
|
||||||
}
|
|
||||||
if (m() > 1) k = j;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* step5() removes a final -e if m() > 1, and changes -ll to -l if
|
|
||||||
m() > 1. */
|
|
||||||
|
|
||||||
static void step5()
|
|
||||||
{
|
|
||||||
j = k;
|
|
||||||
if (b[k] == 'e')
|
|
||||||
{
|
|
||||||
int a = m();
|
|
||||||
if (a > 1 || a == 1 && !cvc(k-1)) k--;
|
|
||||||
}
|
|
||||||
if (b[k] == 'l' && doublec(k) && m() > 1) k--;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* In stem(p,i,j), p is a char pointer, and the string to be stemmed is from
|
|
||||||
p[i] to p[j] inclusive. Typically i is zero and j is the offset to the last
|
|
||||||
character of a string, (p[j+1] == '\0'). The stemmer adjusts the
|
|
||||||
characters p[i] ... p[j] and returns the new end-point of the string, k.
|
|
||||||
Stemming never increases word length, so i <= k <= j. To turn the stemmer
|
|
||||||
into a module, declare 'stem' as extern, and delete the remainder of this
|
|
||||||
file.
|
|
||||||
*/
|
|
||||||
|
|
||||||
int stem(char * p, int i, int j)
|
|
||||||
{ /* copy the parameters into statics */
|
|
||||||
b = p; k = j; k0 = i;
|
|
||||||
if (k <= k0+1) return k; /*-DEPARTURE-*/
|
|
||||||
|
|
||||||
/* With this line, strings of length 1 or 2 don't go through the
|
|
||||||
stemming process, although no mention is made of this in the
|
|
||||||
published algorithm. Remove the line to match the published
|
|
||||||
algorithm. */
|
|
||||||
|
|
||||||
step1ab(); step1c(); step2(); step3(); step4(); step5();
|
|
||||||
return k;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*--------------------stemmer definition ends here------------------------*/
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h> /* for malloc, free */
|
|
||||||
#include <ctype.h> /* for isupper, islower, tolower */
|
|
||||||
|
|
||||||
static char * s; /* a char * (=string) pointer; passed into b above */
|
|
||||||
|
|
||||||
#define INC 50 /* size units in which s is increased */
|
|
||||||
static int i_max = INC; /* maximum offset in s */
|
|
||||||
|
|
||||||
void increase_s()
|
|
||||||
{
|
|
||||||
i_max += INC;
|
|
||||||
{
|
|
||||||
char * new_s = (char *) malloc(i_max+1);
|
|
||||||
{ /* copy across */
|
|
||||||
int i; for (i = 0; i < i_max; i++) new_s[i] = s[i];
|
|
||||||
}
|
|
||||||
free(s); s = new_s;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#define LETTER(ch) (isupper(ch) || islower(ch))
|
|
||||||
|
|
||||||
static void stemfile(FILE * f)
|
|
||||||
{
|
|
||||||
while(TRUE)
|
|
||||||
{
|
|
||||||
int ch = getc(f);
|
|
||||||
if (ch == EOF) return;
|
|
||||||
if (LETTER(ch))
|
|
||||||
{
|
|
||||||
int i = 0;
|
|
||||||
while(TRUE)
|
|
||||||
{
|
|
||||||
if (i == i_max) increase_s();
|
|
||||||
|
|
||||||
ch = tolower(ch); /* forces lower case */
|
|
||||||
|
|
||||||
s[i] = ch; i++;
|
|
||||||
ch = getc(f);
|
|
||||||
if (!LETTER(ch)) { ungetc(ch,f); break; }
|
|
||||||
}
|
|
||||||
s[stem(s,0,i-1)+1] = 0;
|
|
||||||
/* the previous line calls the stemmer and uses its result to
|
|
||||||
zero-terminate the string in s */
|
|
||||||
printf("%s",s);
|
|
||||||
}
|
|
||||||
else putchar(ch);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Commented out as required by amberfish's INSTALL file
|
|
||||||
*
|
|
||||||
int main(int argc, char * argv[])
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
s = (char *) malloc(i_max+1);
|
|
||||||
for (i = 1; i < argc; i++)
|
|
||||||
{
|
|
||||||
FILE * f = fopen(argv[i],"r");
|
|
||||||
if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); exit(1); }
|
|
||||||
stemfile(f);
|
|
||||||
}
|
|
||||||
free(s);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
*/
|
|
|
@ -1,19 +1,6 @@
|
||||||
Amberfish is general purpose text retrieval software, developed at Etymon
|
Amberfish is a full-text search engine with a command-line interface.
|
||||||
by Nassib Nassar and distributed as open source software under the terms
|
Its features include free-text and Boolean queries, relevance-ranked
|
||||||
of version 2 of the GNU General Public License (GPL). Its distinguishing
|
results, wildcard search, phrase search, field search and structured
|
||||||
features are indexing/search of semi-structured text (i.e. both free text
|
field path queries for XML, multiple documents per file and nested
|
||||||
and multiply nested fields), built-in support for XML documents using the
|
documents, searching across multiple indexes, incremental update of
|
||||||
Xerces library, structured queries allowing generalized field/tag paths,
|
indexes, and low memory requirements for building indexes.
|
||||||
hierarchical result sets (XML only), automatic searching across multiple
|
|
||||||
databases (allowing modular indexing), TREC format results, efficient
|
|
||||||
indexing, and relatively low memory requirements during indexing (and the
|
|
||||||
ability to index documents larger than available memory). Z39.50 support
|
|
||||||
is available. Other features include Boolean queries, right truncation,
|
|
||||||
phrase searching, relevance ranking, support for multiple documents per
|
|
||||||
file, incremental indexing, and easy integration with other UNIX tools.
|
|
||||||
The architecture is also designed to permit proximity queries; however,
|
|
||||||
they are not fully implemented at present.
|
|
||||||
|
|
||||||
This port also includes the Porter stemming algorithm for suffix
|
|
||||||
stripping, available at:
|
|
||||||
http://www.tartarus.org/~martin/PorterStemmer
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue