mirror of
https://gitdl.cn/https://github.com/chakralinux/desktop.git
synced 2025-02-03 17:47:14 +08:00
635 lines
21 KiB
Diff
635 lines
21 KiB
Diff
diff -ruN libtextcat-2.2.part1/src/constants.h libtextcat-2.2/src/constants.h
|
|
--- libtextcat-2.2.part1/src/constants.h 2007-07-25 10:46:49.000000000 +0100
|
|
+++ libtextcat-2.2/src/constants.h 2007-07-25 10:47:25.000000000 +0100
|
|
@@ -39,6 +39,8 @@
|
|
*/
|
|
#include <limits.h>
|
|
|
|
+#define _UTF8_
|
|
+
|
|
#define DESCRIPTION "out of place"
|
|
|
|
/* Reported matches are those fingerprints with a score less than best
|
|
@@ -59,14 +61,21 @@
|
|
/* Maximum number of n-grams in a fingerprint */
|
|
#define MAXNGRAMS 400
|
|
|
|
-/* Maximum size of an n-gram? */
|
|
-#define MAXNGRAMSIZE 5
|
|
+/* Maximum number of characters (symbols) in an n-gram */
|
|
+#define MAXNGRAMSYMBOL 5
|
|
+
|
|
+/* Maximum size in bytes of the string representing an n-gram (must be at least the number of symbols) */
|
|
+#ifdef _UTF8_
|
|
+#define MAXNGRAMSIZE 20
|
|
+#else
|
|
+#define MAXNGRAMSIZE MAXNGRAMSYMBOL
|
|
+#endif
|
|
|
|
/* Which characters are not acceptable in n-grams? */
|
|
#define INVALID(c) (isspace((int)c) || isdigit((int)c))
|
|
|
|
/* Minimum size (in characters) for accepting a document */
|
|
-#define MINDOCSIZE 25
|
|
+#define MINDOCSIZE 6
|
|
|
|
/* Maximum penalty for missing an n-gram in fingerprint */
|
|
#define MAXOUTOFPLACE 400
|
|
@@ -76,4 +85,7 @@
|
|
|
|
#define MAXSCORE INT_MAX
|
|
|
|
+/* Default path prefix under which the fingerprint files are stored */
|
|
+#define DEFAULT_FINGERPRINTS_PATH ""
|
|
+
|
|
#endif
|
|
diff -ruN libtextcat-2.2.part1/src/fingerprint.c libtextcat-2.2/src/fingerprint.c
|
|
--- libtextcat-2.2.part1/src/fingerprint.c 2007-07-25 10:46:49.000000000 +0100
|
|
+++ libtextcat-2.2/src/fingerprint.c 2007-07-25 10:47:25.000000000 +0100
|
|
@@ -63,6 +63,10 @@
|
|
* - put table/heap datastructure in a separate file.
|
|
*/
|
|
|
|
+#ifndef _UTF8_
|
|
+#define _UTF8_
|
|
+#endif
|
|
+
|
|
#include "config.h"
|
|
#include <stdio.h>
|
|
#ifdef HAVE_STDLIB_H
|
|
@@ -80,10 +84,12 @@
|
|
#include "wg_mempool.h"
|
|
#include "constants.h"
|
|
|
|
+#include "utf8misc.h"
|
|
|
|
#define TABLESIZE (1<<TABLEPOW)
|
|
#define TABLEMASK ((TABLESIZE)-1)
|
|
|
|
+
|
|
typedef struct {
|
|
|
|
sint2 rank;
|
|
@@ -134,29 +140,14 @@
|
|
}
|
|
|
|
|
|
-/* checks if n-gram lex is a prefix of key and of length len */
|
|
-inline int issame( char *lex, char *key, int len )
|
|
-{
|
|
- int i;
|
|
- for (i=0; i<len; i++) {
|
|
- if ( key[i] != lex[i] ) {
|
|
- return 0;
|
|
- }
|
|
- }
|
|
- if ( lex[i] != 0 ) {
|
|
- return 0;
|
|
- }
|
|
- return 1;
|
|
-}
|
|
-
|
|
|
|
/* increases frequency of ngram(p,len) */
|
|
-static inline int increasefreq( table_t *t, char *p, int len )
|
|
-{
|
|
- uint4 hash = simplehash( p, len ) & TABLEMASK;
|
|
+static int increasefreq( table_t *t, char *p, int len )
|
|
+{
|
|
+ uint4 hash = simplehash( p, len ) & TABLEMASK;
|
|
entry_t *entry = t->table[ hash ];
|
|
-
|
|
- while ( entry ) {
|
|
+
|
|
+ while ( entry ) {
|
|
if ( issame( entry->str, p, len ) ) {
|
|
/*** Found it! ***/
|
|
entry->cnt++;
|
|
@@ -168,7 +159,7 @@
|
|
}
|
|
|
|
/*** Not found, so create ***/
|
|
- entry = wgmempool_alloc( t->pool, sizeof(entry_t) );
|
|
+ entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) ));
|
|
strcpy( entry->str, p );
|
|
entry->cnt = 1;
|
|
|
|
@@ -181,12 +172,12 @@
|
|
#if 0
|
|
|
|
/* looks up ngram(p,len) */
|
|
-static entry_t *findfreq( table_t *t, char *p, int len )
|
|
-{
|
|
- uint4 hash = simplehash( p, len ) & TABLEMASK;
|
|
+static entry_t *findfreq( table_t *t, char *p, int len )
|
|
+{
|
|
+ uint4 hash = simplehash( p, len ) & TABLEMASK;
|
|
entry_t *entry = t->table[ hash ];
|
|
-
|
|
- while ( entry ) {
|
|
+
|
|
+ while ( entry ) {
|
|
if ( issame( entry->str, p, len ) ) {
|
|
return entry;
|
|
}
|
|
@@ -219,7 +210,7 @@
|
|
#define GREATER(x,y) ((x).cnt > (y).cnt)
|
|
#define LESS(x,y) ((x).cnt < (y).cnt)
|
|
|
|
-inline static void siftup( table_t *t, unsigned int child )
|
|
+static void siftup( table_t *t, unsigned int child )
|
|
{
|
|
entry_t *heap = t->heap;
|
|
unsigned int parent = (child-1) >> 1;
|
|
@@ -241,7 +232,7 @@
|
|
}
|
|
|
|
|
|
-inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
|
|
+static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
|
|
{
|
|
entry_t *heap = t->heap;
|
|
unsigned int child = parent*2 + 1;
|
|
@@ -458,21 +449,27 @@
|
|
return dest;
|
|
}
|
|
|
|
-
|
|
+/**
|
|
+* This function extracts all n-grams from the passed buffer and puts them into the table "t"
|
|
+* [modified] by Jocelyn Merand to accept UTF-8 multi-byte symbols, for use in OpenOffice
|
|
+*/
|
|
static void createngramtable( table_t *t, const char *buf )
|
|
{
|
|
char n[MAXNGRAMSIZE+1];
|
|
const char *p = buf;
|
|
int i;
|
|
+ int pointer = 0;
|
|
|
|
/*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/
|
|
- for (;;p++) {
|
|
+ while(1) {
|
|
|
|
- const char *q = p;
|
|
+ const char *q = &p[pointer]; /*[modified] previously p++ above (for(;;p++)) now, it's pointer wich is increased so we have to get the new pointer on the buffer*/
|
|
char *m = n;
|
|
|
|
/*** First char may be an underscore ***/
|
|
- *m++ = *q++;
|
|
+ int decay = charcopy(q, m); /*[modified] previously *q++ = *m++*/
|
|
+ q = &(p[pointer+decay]); /*[modified] the old copying method do not manage multi-character symbols*/
|
|
+ m += decay; /*[modified]*/
|
|
*m = '\0';
|
|
|
|
increasefreq( t, n, 1 );
|
|
@@ -482,19 +479,22 @@
|
|
}
|
|
|
|
/*** Let the compiler unroll this ***/
|
|
- for ( i=2; i<=MAXNGRAMSIZE; i++) {
|
|
+ for ( i=2; i<=MAXNGRAMSYMBOL; i++) {
|
|
|
|
- *m++ = *q;
|
|
+ decay = charcopy(q, m); /*[modified] like above*/
|
|
+ m += decay;
|
|
*m = '\0';
|
|
|
|
increasefreq( t, n, i );
|
|
|
|
if ( *q == '_' ) break;
|
|
- q++;
|
|
+ q += decay;
|
|
if ( *q == '\0' ) {
|
|
return;
|
|
}
|
|
}
|
|
+
|
|
+ pointer = nextcharstart(p,pointer); /*[modified] p[pointer] must point on the next start of symbol, but whith utf next start is not surely next char*/
|
|
}
|
|
return;
|
|
}
|
|
diff -ruN libtextcat-2.2.part1/src/fingerprint.h.orig libtextcat-2.2/src/fingerprint.h.orig
|
|
--- libtextcat-2.2.part1/src/fingerprint.h.orig 1970-01-01 01:00:00.000000000 +0100
|
|
+++ libtextcat-2.2/src/fingerprint.h.orig 2007-07-25 10:47:22.000000000 +0100
|
|
@@ -0,0 +1,55 @@
|
|
+#ifndef _FINGERPRINT_H_
|
|
+#define _FINGERPRINT_H_
|
|
+/*
|
|
+ * Copyright (C) 2003 WiseGuys Internet B.V.
|
|
+ *
|
|
+ * THE BSD LICENSE
|
|
+ *
|
|
+ * Redistribution and use in source and binary forms, with or without
|
|
+ * modification, are permitted provided that the following conditions
|
|
+ * are met:
|
|
+ *
|
|
+ * - Redistributions of source code must retain the above copyright
|
|
+ * notice, this list of conditions and the following disclaimer.
|
|
+ *
|
|
+ * - Redistributions in binary form must reproduce the above copyright
|
|
+ * notice, this list of conditions and the following disclaimer in the
|
|
+ * documentation and/or other materials provided with the
|
|
+ * distribution.
|
|
+ *
|
|
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
|
|
+ * its contributors may be used to endorse or promote products derived
|
|
+ * from this software without specific prior written permission.
|
|
+ *
|
|
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+ */
|
|
+#include "common.h"
|
|
+
|
|
+#ifdef __cplusplus
|
|
+extern "C" {
|
|
+#endif
|
|
+
|
|
+extern void *fp_Init(const char *name);
|
|
+extern void fp_Done( void *handle );
|
|
+extern int fp_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams );
|
|
+extern int fp_Read( void *handle, const char *fname, int maxngrams );
|
|
+extern sint4 fp_Compare( void *cat, void *unknown, int cutoff );
|
|
+extern void fp_Show( void *handle );
|
|
+extern const char *fp_Name( void *handle );
|
|
+extern void fp_Print( void *handle, FILE *fp );
|
|
+
|
|
+#ifdef __cplusplus
|
|
+}
|
|
+#endif
|
|
+
|
|
+#endif
|
|
diff -ruN libtextcat-2.2.part1/src/textcat.c libtextcat-2.2/src/textcat.c
|
|
--- libtextcat-2.2.part1/src/textcat.c 2007-07-25 10:46:49.000000000 +0100
|
|
+++ libtextcat-2.2/src/textcat.c 2007-07-25 10:47:25.000000000 +0100
|
|
@@ -74,6 +74,7 @@
|
|
typedef struct {
|
|
|
|
void **fprint;
|
|
+ char *fprint_disable;
|
|
uint4 size;
|
|
uint4 maxsize;
|
|
|
|
@@ -112,11 +113,21 @@
|
|
fp_Done( h->fprint[i] );
|
|
}
|
|
wg_free( h->fprint );
|
|
+ wg_free( h->fprint_disable );
|
|
wg_free( h );
|
|
|
|
}
|
|
|
|
-extern void *textcat_Init( const char *conffile )
|
|
+/** Replaces older function */
|
|
+extern void *textcat_Init( const char *conffile ){
|
|
+ return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH );
|
|
+}
|
|
+
|
|
+/**
|
|
+ * Originally this function had only one parameter (conffile); it has been modified for use by OOo.
|
|
+ * Basically, prefix is the directory path where fingerprints are stored; it is prepended verbatim to each fingerprint file name
|
|
+ */
|
|
+extern void *special_textcat_Init( const char *conffile, const char *prefix )
|
|
{
|
|
textcat_t *h;
|
|
char line[1024];
|
|
@@ -134,11 +145,13 @@
|
|
h->size = 0;
|
|
h->maxsize = 16;
|
|
h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize );
|
|
+ h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize ); /*added to store the state of languages*/
|
|
|
|
while ( wg_getline( line, 1024, fp ) ) {
|
|
char *p;
|
|
char *segment[4];
|
|
- int res;
|
|
+ char finger_print_file_name[512];
|
|
+ int res;
|
|
|
|
/*** Skip comments ***/
|
|
#ifdef HAVE_STRCHR
|
|
@@ -156,17 +169,23 @@
|
|
/*** Ensure enough space ***/
|
|
if ( h->size == h->maxsize ) {
|
|
h->maxsize *= 2;
|
|
- h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
|
|
+ h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
|
|
+ h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize );
|
|
}
|
|
|
|
/*** Load data ***/
|
|
if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) {
|
|
goto ERROR;
|
|
}
|
|
- if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) {
|
|
+ finger_print_file_name[0] = '\0';
|
|
+ strcat(finger_print_file_name, prefix);
|
|
+ strcat(finger_print_file_name, segment[0]);
|
|
+
|
|
+ if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) {
|
|
textcat_Done(h);
|
|
goto ERROR;
|
|
- }
|
|
+ }
|
|
+ h->fprint_disable[h->size] = 0xF0; /*0xF0 is the code for enabled languages, 0x0F is for disabled*/
|
|
h->size++;
|
|
}
|
|
|
|
@@ -203,11 +222,18 @@
|
|
result = _TEXTCAT_RESULT_SHORT;
|
|
goto READY;
|
|
}
|
|
-
|
|
+
|
|
/*** Calculate the score for each category. ***/
|
|
for (i=0; i<h->size; i++) {
|
|
- int score = fp_Compare( h->fprint[i], unknown, threshold );
|
|
- candidates[i].score = score;
|
|
+ int score;
|
|
+ if(h->fprint_disable[i] & 0x0F){ /*if this language is disabled*/
|
|
+ score = MAXSCORE;
|
|
+ }
|
|
+ else{
|
|
+ score = fp_Compare( h->fprint[i], unknown, threshold );
|
|
+ /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/
|
|
+ }
|
|
+ candidates[i].score = score;
|
|
candidates[i].name = fp_Name( h->fprint[i] );
|
|
if ( score < minscore ) {
|
|
minscore = score;
|
|
diff -ruN libtextcat-2.2.part1/src/textcat.h libtextcat-2.2/src/textcat.h
|
|
--- libtextcat-2.2.part1/src/textcat.h 2007-07-25 10:46:49.000000000 +0100
|
|
+++ libtextcat-2.2/src/textcat.h 2007-07-25 10:48:18.000000000 +0100
|
|
@@ -55,10 +54,19 @@
|
|
* Returns: handle on success, NULL on error. (At the moment, the
|
|
* only way errors can occur, is when the library cannot read the
|
|
* conffile, or one of the fingerprint files listed in it.)
|
|
+ *
|
|
+ * Replaces the older function (and has exactly the same behaviour);
|
|
+ * see below
|
|
*/
|
|
extern void *textcat_Init( const char *conffile );
|
|
|
|
/**
|
|
+ * Originally this function had only one parameter (conffile); it has been modified because OOo must be able to load alternative DBs
|
|
+ * Basically, prefix is the directory path where fingerprints are stored
|
|
+ */
|
|
+extern void *special_textcat_Init( const char *conffile, const char *prefix );
|
|
+
|
|
+/**
|
|
* textcat_Done() - Free up resources for handle
|
|
*/
|
|
extern void textcat_Done( void *handle );
|
|
diff -ruN libtextcat-2.2.part1/src/utf8misc.c libtextcat-2.2/src/utf8misc.c
|
|
--- libtextcat-2.2.part1/src/utf8misc.c 1970-01-01 01:00:00.000000000 +0100
|
|
+++ libtextcat-2.2/src/utf8misc.c 2007-07-25 10:48:57.000000000 +0100
|
|
@@ -0,0 +1,132 @@
|
|
+/***************************************************************************
|
|
+ * Copyright (C) 2006 by Jocelyn Merand *
|
|
+ * joc.mer@gmail.com *
|
|
+ * *
|
|
+ * THE BSD LICENSE
|
|
+ *
|
|
+ * Redistribution and use in source and binary forms, with or without
|
|
+ * modification, are permitted provided that the following conditions
|
|
+ * are met:
|
|
+ *
|
|
+ * - Redistributions of source code must retain the above copyright
|
|
+ * notice, this list of conditions and the following disclaimer.
|
|
+ *
|
|
+ * - Redistributions in binary form must reproduce the above copyright
|
|
+ * notice, this list of conditions and the following disclaimer in the
|
|
+ * documentation and/or other materials provided with the
|
|
+ * distribution.
|
|
+ *
|
|
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
|
|
+ * its contributors may be used to endorse or promote products derived
|
|
+ * from this software without specific prior written permission.
|
|
+ *
|
|
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+ ***************************************************************************/
|
|
+
|
|
+#ifndef _UTF8_MISC_H_
|
|
+#include "utf8misc.h"
|
|
+#endif
|
|
+
|
|
+
|
|
+int nextcharstart(const char *str, int position){
|
|
+ int pointer = position;
|
|
+
|
|
+ if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/
|
|
+
|
|
+ /*then str[pointer] is an escape character*/
|
|
+
|
|
+ char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (by bit translation) following characters (only the weightest part)*/
|
|
+
|
|
+ while(escape_char & ESCAPE_MASK && str[pointer]){/*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/
|
|
+ escape_char = escape_char <<1;
|
|
+ ++pointer;
|
|
+ }
|
|
+ }
|
|
+ if(str[pointer]){ /*finaly, if we are not on the \0 character, we jump to the next character*/
|
|
+ ++pointer;
|
|
+ }
|
|
+ return pointer;
|
|
+}
|
|
+
|
|
+
|
|
+int charcopy(const char *str, char *dest){
|
|
+
|
|
+ int pointer = 0;
|
|
+ if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/
|
|
+
|
|
+ /*then str[pointer] is an escape character*/
|
|
+
|
|
+ char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count following characters (only the weightest part)*/
|
|
+
|
|
+ while(escape_char & ESCAPE_MASK && str[pointer]){ /*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/
|
|
+ dest[pointer] = str[pointer];
|
|
+ escape_char = escape_char <<1;
|
|
+ ++pointer;
|
|
+ }
|
|
+ }
|
|
+ if(str[pointer]){
|
|
+ dest[pointer] = str[pointer];
|
|
+ ++pointer;
|
|
+ }
|
|
+
|
|
+ return pointer;
|
|
+}
|
|
+
|
|
+
|
|
+int issame( char *lex, char *key, int len )
|
|
+{
|
|
+ /*printf("[%s] prefix of [%s] with length %i", lex, key, len);*/
|
|
+ int char_counter = 0;
|
|
+ int pointer = 0;
|
|
+ while(char_counter < len) {
|
|
+
|
|
+ if(key[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/
|
|
+
|
|
+ /*then key[pointer] is an escap character*/
|
|
+
|
|
+ char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (only the weightest part)*/
|
|
+
|
|
+ while(escape_char & ESCAPE_MASK && key[pointer] == lex[pointer] ){
|
|
+ escape_char = escape_char <<1;
|
|
+ ++pointer;
|
|
+ }
|
|
+ }
|
|
+ ++char_counter; /*and we are on a new utf8 character*/
|
|
+ if ( key[pointer] != lex[pointer] ) {
|
|
+ return 0;
|
|
+ /*printf(" NO\n", lex, key, len);*/
|
|
+ }
|
|
+ ++pointer;
|
|
+ }
|
|
+ if ( lex[pointer] != '\0' ) {
|
|
+ return 0;
|
|
+ /*printf(" NO\n");*/
|
|
+ }
|
|
+
|
|
+ /*printf(" YES\n");*/
|
|
+
|
|
+ return 1;
|
|
+}
|
|
+
|
|
+
|
|
+extern int utfstrlen(const char* str){
|
|
+ int char_counter = 0;
|
|
+ int pointer = 0;
|
|
+ while(str[pointer]) {
|
|
+ pointer = nextcharstart(str, pointer);
|
|
+
|
|
+ ++char_counter; /*and we are on a new utf8 character*/
|
|
+ }
|
|
+ return char_counter;
|
|
+}
|
|
+
|
|
diff -ruN libtextcat-2.2.part1/src/utf8misc.h libtextcat-2.2/src/utf8misc.h
|
|
--- libtextcat-2.2.part1/src/utf8misc.h 1970-01-01 01:00:00.000000000 +0100
|
|
+++ libtextcat-2.2/src/utf8misc.h 2007-07-25 10:48:57.000000000 +0100
|
|
@@ -0,0 +1,88 @@
|
|
+/***************************************************************************
|
|
+ * Copyright (C) 2006 by Jocelyn Merand *
|
|
+ * joc.mer@gmail.com *
|
|
+ * *
|
|
+ * THE BSD LICENSE
|
|
+ *
|
|
+ * Redistribution and use in source and binary forms, with or without
|
|
+ * modification, are permitted provided that the following conditions
|
|
+ * are met:
|
|
+ *
|
|
+ * - Redistributions of source code must retain the above copyright
|
|
+ * notice, this list of conditions and the following disclaimer.
|
|
+ *
|
|
+ * - Redistributions in binary form must reproduce the above copyright
|
|
+ * notice, this list of conditions and the following disclaimer in the
|
|
+ * documentation and/or other materials provided with the
|
|
+ * distribution.
|
|
+ *
|
|
+ * - Neither the name of the WiseGuys Internet B.V. nor the names of
|
|
+ * its contributors may be used to endorse or promote products derived
|
|
+ * from this software without specific prior written permission.
|
|
+ *
|
|
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
+ ***************************************************************************/
|
|
+
|
|
+#ifndef _UTF8_MISC_H_
|
|
+#define _UTF8_MISC_H_
|
|
+
|
|
+/**
|
|
+ * These variables are used in character processing functions
|
|
+ * These have been added to manage utf-8 symbols, particularly escape chars
|
|
+ */
|
|
+#ifdef _UTF8_
|
|
+#define ESCAPE_MASK 0x80
|
|
+#define WEIGHT_MASK 0xF0
|
|
+#else
|
|
+#define ESCAPE_MASK 0xFF
|
|
+#define WEIGHT_MASK 0x00
|
|
+#endif
|
|
+
|
|
+
|
|
+/*
|
|
+ * Is used to jump to the next start of char
|
|
+ * of course it's only useful when the encoding is UTF-8
|
|
+ * This function has been added by Jocelyn Merand to use libtextcat in OOo
|
|
+ */
|
|
+int nextcharstart(const char *str, int position);
|
|
+
|
|
+
|
|
+/*Copy the char in str to dest
|
|
+ * of course it's only useful when the encoding is UTF-8 and the symbol is encoded with more than 1 byte
|
|
+ * returns the number of bytes copied
|
|
+ * This function has been added by Jocelyn Merand to use libtextcat in OOo
|
|
+ */
|
|
+int charcopy(const char *str, char *dest);
|
|
+
|
|
+
|
|
+/* checks if n-gram lex is a prefix of key and of length len
|
|
+* if _UTF8_ is defined, it uses escape characters and len is not really the byte length of lex
|
|
+* in this case, len is the number of UTF-8 characters: strlen("€") == 3 but len == 1
|
|
+*/
|
|
+int issame( char *lex, char *key, int len );
|
|
+
|
|
+
|
|
+/* Counts the number of characters
|
|
+* if _UTF8_ is defined, it uses escape characters and the result is not really the byte length of str
|
|
+* in this case, the result is the number of UTF-8 characters: strlen("€") == 3 but utfstrlen("€") == 1
|
|
+*/
|
|
+#ifdef __cplusplus
|
|
+extern "C" {
|
|
+#endif
|
|
+extern int utfstrlen(const char* str);
|
|
+#ifdef __cplusplus
|
|
+}
|
|
+#endif
|
|
+
|
|
+#endif
|
|
+
|
|
--- libtextcat-2.2.part2/src/Makefile.am 2007-07-25 10:55:02.000000000 +0100
|
|
+++ libtextcat-2.2/src/Makefile.am 2007-07-25 10:55:52.000000000 +0100
|
|
@@ -12,11 +12,11 @@
|
|
|
|
libtextcat_includedir = $(includedir)/libtextcat
|
|
libtextcat_include_HEADERS = \
|
|
- common.h constants.h fingerprint.h textcat.h
|
|
+ common.h constants.h fingerprint.h textcat.h utf8misc.h
|
|
|
|
lib_LTLIBRARIES = libtextcat.la
|
|
libtextcat_la_SOURCES = \
|
|
- common.c fingerprint.c textcat.c wg_mempool.c
|
|
+ common.c fingerprint.c textcat.c wg_mempool.c utf8misc.c
|
|
|
|
bin_PROGRAMS = createfp
|
|
createfp_SOURCES = createfp.c
|