New port: misc/libpostal: Library for parsing/normalizing street addresses around the world

PR:		224262
Submitted by:	Dmitri Goutnik <dg@syrec.org>
Approved by:	adamw (mentor)
Differential Revision:	https://reviews.freebsd.org/D13468
This commit is contained in:
Yuri Victorovich 2017-12-18 22:43:44 +00:00
parent f0294bafaa
commit 4dbc2863a7
Notes: svn2git 2021-03-31 03:12:20 +00:00
svn path=/head/; revision=456691
11 changed files with 224 additions and 0 deletions

View file

@ -250,6 +250,7 @@
SUBDIR += libisocodes
SUBDIR += libkdeedu
SUBDIR += libmetalink
SUBDIR += libpostal
SUBDIR += libpri
SUBDIR += libsupertone
SUBDIR += libutf

58
misc/libpostal/Makefile Normal file
View file

@ -0,0 +1,58 @@
# $FreeBSD$

PORTNAME=	libpostal
DISTVERSIONPREFIX=	v
DISTVERSION=	1.0.0
CATEGORIES=	misc geography

MAINTAINER=	dg@syrec.org
COMMENT=	Library for parsing/normalizing street addresses around the world

LICENSE=	MIT
LICENSE_FILE=	${WRKSRC}/LICENSE

RUN_DEPENDS=	curl:ftp/curl

# localbase: the port's patches drop the hard-coded /usr/local include and
#            library paths from the upstream build files.
# pathfix:   relocates the pkg-config file to libdata/pkgconfig, which is
#            where pkg-plist lists libpostal.pc.
USES=		autoreconf libtool localbase pathfix
GNU_CONFIGURE=	yes
USE_LDCONFIG=	yes
# The port's patches compile $(datadir) in verbatim as the model data
# directory.  The data is not fetched at build time; pkg-message tells the
# user how to download it after installation.
CONFIGURE_ARGS=	--datadir=${LIBPOSTAL_DATADIR} \
		--disable-data-download
USE_GITHUB=	yes
GH_ACCOUNT=	openvenues
TEST_TARGET=	check

# Location of the model data; may be overridden by the user.
LIBPOSTAL_DATADIR?=	/var/db/${PORTNAME}

SUB_FILES=	pkg-message
SUB_LIST=	PORTNAME=${PORTNAME} \
		LIBPOSTAL_DATADIR=${LIBPOSTAL_DATADIR}

PORTDOCS=	README.md

OPTIONS_DEFINE=	CBLAS DOCS SSE2 STATIC
CBLAS_DESC=	Build with CBLAS/OPENBLAS
SSE2_DESC=	Enable SSE2 optimization
OPTIONS_SUB=	yes

CBLAS_BUILD_DEPENDS=	${LOCALBASE}/include/cblas.h:math/cblas
CBLAS_LIB_DEPENDS=	libopenblas.so:math/openblas
CBLAS_CONFIGURE_WITH=	cblas=${LOCALBASE}/lib/libopenblas.so
SSE2_CONFIGURE_ENABLE=	sse2
STATIC_CONFIGURE_ENABLE=	static

pre-configure:
	@cd ${WRKSRC} && ${SH} bootstrap.sh

# The command-line tools are installed by hand here.  The libpostal program
# links against libpostal.la, so ${WRKSRC}/src/libpostal is only libtool's
# wrapper script, which refers back to the build tree; the real executable
# is in src/.libs and is what must be staged.
post-install:
	@${STRIP_CMD} ${STAGEDIR}${PREFIX}/lib/libpostal.so
	${INSTALL_PROGRAM} ${WRKSRC}/src/address_parser ${STAGEDIR}${PREFIX}/bin
	${INSTALL_PROGRAM} ${WRKSRC}/src/.libs/libpostal ${STAGEDIR}${PREFIX}/bin

# Documentation is plain data, so it is installed with INSTALL_DATA.
post-install-DOCS-on:
	@${MKDIR} ${STAGEDIR}${DOCSDIR}
	${INSTALL_DATA} ${PORTDOCS:S|^|${WRKSRC}/|} ${STAGEDIR}${DOCSDIR}

.include <bsd.port.mk>

3
misc/libpostal/distinfo Normal file
View file

@ -0,0 +1,3 @@
TIMESTAMP = 1513006366
SHA256 (openvenues-libpostal-v1.0.0_GH0.tar.gz) = 3035af7e15b2894069753975d953fa15a86d968103913dbf8ce4b8aa26231644
SIZE (openvenues-libpostal-v1.0.0_GH0.tar.gz) = 5537587

View file

@ -0,0 +1,11 @@
--- configure.ac.orig 2017-04-07 21:40:27 UTC
+++ configure.ac
@@ -19,7 +19,7 @@ AC_CONFIG_HEADERS([config.h])
AC_PROG_CC_C99
AC_PROG_INSTALL
-LDFLAGS="$LDFLAGS -L/usr/local/lib"
+#LDFLAGS="$LDFLAGS -L/usr/local/lib"
# Checks for libraries.
AC_SEARCH_LIBS([log],

View file

@ -0,0 +1,74 @@
--- src/Makefile.am.orig 2017-04-07 21:40:27 UTC
+++ src/Makefile.am
@@ -1,11 +1,11 @@
# Inherited from autoconf / user-specified
CFLAGS_CONF = @CFLAGS@
-CFLAGS_BASE = -Wall -Wextra -Wno-unused-function -Wformat -Werror=format-security -Winit-self -Wno-sign-compare -DLIBPOSTAL_DATA_DIR='"$(datadir)/libpostal"' -g $(CFLAGS_CONF)
+CFLAGS_BASE = -Wall -Wextra -Wno-unused-function -Wformat -Werror=format-security -Winit-self -Wno-sign-compare -DLIBPOSTAL_DATA_DIR='"$(datadir)"' $(CFLAGS_CONF)
CFLAGS_O0 = $(CFLAGS_BASE) -O0
CFLAGS_O1 = $(CFLAGS_BASE) -O1
CFLAGS_O2 = $(CFLAGS_BASE) -O2
CFLAGS_O3 = $(CFLAGS_BASE) -O3
-DEFAULT_INCLUDES = -I.. -I/usr/local/include
+DEFAULT_INCLUDES = -I..
# Wonky but have to be able to override the user's optimization level to compile the scanner
# as it takes an unreasonably long time to compile with the optimizer on.
@@ -14,7 +14,7 @@ CFLAGS =
lib_LTLIBRARIES = libpostal.la
libpostal_la_SOURCES = libpostal.c address_dictionary.c transliterate.c tokens.c trie.c trie_search.c trie_utils.c string_utils.c file_utils.c numex.c utf8proc/utf8proc.c cmp/cmp.c normalize.c features.c unicode_scripts.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c averaged_perceptron_tagger.c graph.c graph_builder.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c float_utils.c ngrams.c
libpostal_la_LIBADD = libscanner.la $(CBLAS_LIBS)
-libpostal_la_CFLAGS = $(CFLAGS_O2)
+libpostal_la_CFLAGS = $(CFLAGS_BASE)
libpostal_la_LDFLAGS = -version-info @LIBPOSTAL_SO_VERSION@
dist_bin_SCRIPTS = libpostal_data
@@ -30,37 +30,37 @@ noinst_PROGRAMS = libpostal bench addres
libpostal_SOURCES = main.c json_encode.c
libpostal_LDADD = libpostal.la
-libpostal_CFLAGS = $(CFLAGS_O3)
+libpostal_CFLAGS = $(CFLAGS_BASE)
bench_SOURCES = bench.c
bench_LDADD = libpostal.la libscanner.la $(CBLAS_LIBS)
-bench_CFLAGS = $(CFLAGS_O3)
+bench_CFLAGS = $(CFLAGS_BASE)
address_parser_SOURCES = address_parser_cli.c json_encode.c linenoise/linenoise.c libpostal.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c numex.c language_classifier.c language_features.c logistic_regression.c logistic.c minibatch.c
address_parser_LDADD = libscanner.la $(CBLAS_LIBS)
-address_parser_CFLAGS = $(CFLAGS_O3)
+address_parser_CFLAGS = $(CFLAGS_BASE)
build_address_dictionary_SOURCES = address_dictionary_builder.c address_dictionary.c file_utils.c string_utils.c trie.c trie_search.c utf8proc/utf8proc.c
-build_address_dictionary_CFLAGS = $(CFLAGS_O3)
+build_address_dictionary_CFLAGS = $(CFLAGS_BASE)
build_numex_table_SOURCES = numex_table_builder.c numex.c file_utils.c string_utils.c tokens.c trie.c trie_search.c utf8proc/utf8proc.c
-build_numex_table_CFLAGS = $(CFLAGS_O3)
+build_numex_table_CFLAGS = $(CFLAGS_BASE)
build_trans_table_SOURCES = transliteration_table_builder.c transliterate.c trie.c trie_search.c file_utils.c string_utils.c utf8proc/utf8proc.c
-build_trans_table_CFLAGS = $(CFLAGS_O3)
+build_trans_table_CFLAGS = $(CFLAGS_BASE)
address_parser_train_SOURCES = address_parser_train.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_trainer.c crf_trainer.c crf_trainer_averaged_perceptron.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c shuffle.c utf8proc/utf8proc.c ngrams.c
address_parser_train_LDADD = libscanner.la $(CBLAS_LIBS)
-address_parser_train_CFLAGS = $(CFLAGS_O3)
+address_parser_train_CFLAGS = $(CFLAGS_BASE)
address_parser_test_SOURCES = address_parser_test.c address_parser.c address_parser_io.c averaged_perceptron.c crf.c crf_context.c sparse_matrix.c graph.c graph_builder.c float_utils.c averaged_perceptron_tagger.c address_dictionary.c normalize.c features.c unicode_scripts.c transliterate.c trie.c trie_search.c trie_utils.c string_utils.c tokens.c file_utils.c utf8proc/utf8proc.c ngrams.c
address_parser_test_LDADD = libscanner.la $(CBLAS_LIBS)
-address_parser_test_CFLAGS = $(CFLAGS_O3)
+address_parser_test_CFLAGS = $(CFLAGS_BASE)
language_classifier_train_SOURCES = language_classifier_train.c language_classifier.c language_features.c language_classifier_io.c logistic_regression_trainer.c logistic_regression.c logistic.c sparse_matrix.c sparse_matrix_utils.c features.c minibatch.c float_utils.c stochastic_gradient_descent.c ftrl.c regularization.c cartesian_product.c normalize.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c shuffle.c
language_classifier_train_LDADD = libscanner.la $(CBLAS_LIBS)
-language_classifier_train_CFLAGS = $(CFLAGS_O3)
+language_classifier_train_CFLAGS = $(CFLAGS_BASE)
language_classifier_SOURCES = language_classifier_cli.c language_classifier.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c
language_classifier_LDADD = libscanner.la $(CBLAS_LIBS)
-language_classifier_CFLAGS = $(CFLAGS_O3)
+language_classifier_CFLAGS = $(CFLAGS_BASE)
language_classifier_test_SOURCES = language_classifier_test.c language_classifier.c language_classifier_io.c language_features.c logistic_regression.c logistic.c sparse_matrix.c features.c minibatch.c float_utils.c normalize.c transliterate.c trie.c trie_search.c trie_utils.c address_dictionary.c string_utils.c file_utils.c utf8proc/utf8proc.c unicode_scripts.c
language_classifier_test_LDADD = libscanner.la $(CBLAS_LIBS)
-language_classifier_test_CFLAGS = $(CFLAGS_O3)
+language_classifier_test_CFLAGS = $(CFLAGS_BASE)
pkginclude_HEADERS = libpostal.h

View file

@ -0,0 +1,23 @@
Prepend a placeholder argument for each download worker and hand xargs
six NUL-terminated fields per chunk, matching -n 6.  Also reset the
ownership of the data directory after a download.

--- src/libpostal_data.orig 2017-04-07 21:40:27 UTC
+++ src/libpostal_data
@@ -78,9 +78,9 @@ download_multipart() {
else
max=$size;
fi;
- printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename"
+ printf "%s\0%s\0%s\0%s\0%s\0%s\0" "x" "$i" "$offset" "$max" "$url" "$part_filename"
offset=$((offset+CHUNK_SIZE))
- done | xargs -0 -n 5 -P $NUM_WORKERS sh -c "$DOWNLOAD_PART" --
+ done | xargs -0 -n 6 -P $NUM_WORKERS sh -c "$DOWNLOAD_PART" --
> $local_path
@@ -176,6 +176,8 @@ if [ $COMMAND = "download" ]; then
download_file $LIBPOSTAL_LANG_CLASS_UPDATED_PATH $LIBPOSTAL_DATA_DIR $lang_class_s3_prefix $LIBPOSTAL_LANG_CLASS_FILE "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR
fi
+ chown -R root:wheel $LIBPOSTAL_DATA_DIR
+
if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then
echo $LIBPOSTAL_VERSION_STRING > $LIBPOSTAL_DATA_VERSION_FILE;
fi

View file

@ -0,0 +1,15 @@
--- src/sparkey/Makefile.am.orig 2017-04-07 21:40:27 UTC
+++ src/sparkey/Makefile.am
@@ -1,5 +1,5 @@
CFLAGS_CONF = @CFLAGS@
-CFLAGS = -I/usr/local/include -O2 -Wall -Wextra -Wfloat-equal -Wshadow -Wpointer-arith -Werror -pedantic $(CFLAGS_CONF)
+CFLAGS = -Wall -Wextra -Wfloat-equal -Wshadow -Wpointer-arith -Werror -pedantic $(CFLAGS_CONF)
noinst_LTLIBRARIES = libsparkey.la
libsparkey_la_SOURCES = endiantools.h hashheader.h logheader.h \
@@ -8,4 +8,4 @@ logreader.c returncodes.c util.c buf.h h
sparkey.h util.h endiantools.c \
hashheader.c hashreader.c logheader.c logwriter.c MurmurHash3.c \
sparkey-internal.h
-libsparkey_la_LDFLAGS = -L/usr/local/lib
+#libsparkey_la_LDFLAGS = -L/usr/local/lib

View file

@ -0,0 +1,20 @@
--- test/Makefile.am.orig 2017-04-07 21:40:27 UTC
+++ test/Makefile.am
@@ -1,9 +1,9 @@
-CFLAGS_BASE = -Wfloat-equal -Wpointer-arith -std=gnu99 -DLIBPOSTAL_DATA_DIR='"$(datadir)/libpostal"' -g
+CFLAGS_BASE = -Wfloat-equal -Wpointer-arith -std=gnu99 -DLIBPOSTAL_DATA_DIR='"$(datadir)"'
CFLAGS_O0 = $(CFLAGS_BASE) -O0
CFLAGS_O1 = $(CFLAGS_BASE) -O1
CFLAGS_O2 = $(CFLAGS_BASE) -O2
CFLAGS_O3 = $(CFLAGS_BASE) -O3
-DEFAULT_INCLUDES = -I.. -I/usr/local/include
+DEFAULT_INCLUDES = -I..
CFLAGS = $(CFLAGS_BASE)
@@ -11,4 +11,4 @@ TESTS = test_libpostal
noinst_PROGRAMS = test_libpostal
test_libpostal_SOURCES = test.c test_expand.c test_parser.c test_transliterate.c test_numex.c test_trie.c test_string_utils.c test_crf_context.c
test_libpostal_LDADD = ../src/libpostal.la $(CBLAS_LIBS)
-test_libpostal_CFLAGS = $(CFLAGS_O3)
+test_libpostal_CFLAGS = $(CFLAGS_BASE)

View file

@ -0,0 +1,4 @@
%%PORTNAME%% requires model data (about 1.5GB) which can be downloaded using
the following command:
# %%PREFIX%%/bin/libpostal_data download all %%LIBPOSTAL_DATADIR%%

6
misc/libpostal/pkg-descr Normal file
View file

@ -0,0 +1,6 @@
C library for parsing/normalizing street addresses around the world, powered
by statistical NLP and open geo data. This library helps convert the
free-form addresses that humans use into clean normalized forms suitable for
machine comparison and full-text indexing.
WWW: https://github.com/openvenues/libpostal

9
misc/libpostal/pkg-plist Normal file
View file

@ -0,0 +1,9 @@
bin/address_parser
bin/libpostal
bin/libpostal_data
include/libpostal/libpostal.h
%%STATIC%%lib/libpostal.a
lib/libpostal.so
lib/libpostal.so.1
lib/libpostal.so.1.0.0
libdata/pkgconfig/libpostal.pc