mirror of
https://git.freebsd.org/ports.git
synced 2025-06-06 21:30:31 -04:00
part). These patches, released under a BSD license, seem to improve the accuracy of language detection, especially those that don't have a Latin script.
135 lines
4.8 KiB
C
135 lines
4.8 KiB
C
--- /dev/null Thu Aug 23 22:58:13 2007
|
|
+++ src/utf8misc.c Thu Aug 23 22:47:07 2007
|
|
@@ -0,0 +1,132 @@
|
|
+/***************************************************************************
|
|
+ * Copyright (C) 2006 by Jocelyn Merand *
|
|
+ * joc.mer@gmail.com *
|
|
+ * *
|
|
+ * THE BSD LICENSE
|
|
+ *
|
|
+ * Redistribution and use in source and binary forms, with or without
|
|
+ * modification, are permitted provided that the following conditions
|
|
+ * are met:
|
|
+ *
|
|
+ * - Redistributions of source code must retain the above copyright
|
|
+ * notice, this list of conditions and the following disclaimer.
|
|
+ *
|
|
+ * - Redistributions in binary form must reproduce the above copyright
|
|
+ * notice, this list of conditions and the following disclaimer in the
|
|
+ * documentation and/or other materials provided with the
|
|
+ * distribution.
|
|
+ *
|
|
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
|
|
+ * its contributors may be used to endorse or promote products derived
|
|
+ * from this software without specific prior written permission.
|
|
+ *
|
|
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+ ***************************************************************************/
|
|
+
|
|
+#ifndef _UTF8_MISC_H_
|
|
+#include "utf8misc.h"
|
|
+#endif
|
|
+
|
|
+
|
|
+int nextcharstart(const char *str, int position){
|
|
+ int pointer = position;
|
|
+
|
|
+ if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/
|
|
+
|
|
+ /*then str[pointer] is an escape character*/
|
|
+
|
|
+ char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (by bit translation) following characters (only the weightest part)*/
|
|
+
|
|
+ while(escape_char & ESCAPE_MASK && str[pointer]){/*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/
|
|
+ escape_char = escape_char <<1;
|
|
+ ++pointer;
|
|
+ }
|
|
+ }
|
|
+ if(str[pointer]){ /*finaly, if we are not on the \0 character, we jump to the next character*/
|
|
+ ++pointer;
|
|
+ }
|
|
+ return pointer;
|
|
+}
|
|
+
|
|
+
|
|
+int charcopy(const char *str, char *dest){
|
|
+
|
|
+ int pointer = 0;
|
|
+ if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/
|
|
+
|
|
+ /*then str[pointer] is an escape character*/
|
|
+
|
|
+ char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count following characters (only the weightest part)*/
|
|
+
|
|
+ while(escape_char & ESCAPE_MASK && str[pointer]){ /*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/
|
|
+ dest[pointer] = str[pointer];
|
|
+ escape_char = escape_char <<1;
|
|
+ ++pointer;
|
|
+ }
|
|
+ }
|
|
+ if(str[pointer]){
|
|
+ dest[pointer] = str[pointer];
|
|
+ ++pointer;
|
|
+ }
|
|
+
|
|
+ return pointer;
|
|
+}
|
|
+
|
|
+
|
|
+int issame( char *lex, char *key, int len )
|
|
+{
|
|
+ /*printf("[%s] prefix of [%s] with length %i", lex, key, len);*/
|
|
+ int char_counter = 0;
|
|
+ int pointer = 0;
|
|
+ while(char_counter < len) {
|
|
+
|
|
+ if(key[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/
|
|
+
|
|
+ /*then key[pointer] is an escap character*/
|
|
+
|
|
+ char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (only the weightest part)*/
|
|
+
|
|
+ while(escape_char & ESCAPE_MASK && key[pointer] == lex[pointer] ){
|
|
+ escape_char = escape_char <<1;
|
|
+ ++pointer;
|
|
+ }
|
|
+ }
|
|
+ ++char_counter; /*and we are on a new utf8 character*/
|
|
+ if ( key[pointer] != lex[pointer] ) {
|
|
+ return 0;
|
|
+ /*printf(" NO\n", lex, key, len);*/
|
|
+ }
|
|
+ ++pointer;
|
|
+ }
|
|
+ if ( lex[pointer] != '\0' ) {
|
|
+ return 0;
|
|
+ /*printf(" NO\n");*/
|
|
+ }
|
|
+
|
|
+ /*printf(" YES\n");*/
|
|
+
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+
|
|
+extern int utfstrlen(const char* str){
|
|
+ int char_counter = 0;
|
|
+ int pointer = 0;
|
|
+ while(str[pointer]) {
|
|
+ pointer = nextcharstart(str, pointer);
|
|
+
|
|
+ ++char_counter; /*and we are on a new utf8 character*/
|
|
+ }
|
|
+ return char_counter;
|
|
+}
|
|
+
|