1 /* analyse.c: Word suffix analysis.
3 Copyright (C) 2020 Michael Zucchi
5 This program is free software: you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation, either version 3 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see
17 <http://www.gnu.org/licenses/>.
// NODE(x): brace initialiser for one table entry that sets only its .value
// field. The argument is substituted without parentheses.
36 #define NODE(x) { .value = x }
// Lookup set queried by iswcollapse(); init() populates it and done() clears it.
38 static ez_set collapse_set;
// Table of the characters that iswcollapse() recognises. init() passes the
// address of each element to ez_set_put().
// NOTE(review): presumably the set stores these pointers rather than copies,
// so the array must stay alive as long as collapse_set - confirm against ez_set.
39 static struct wchar_node collapse_nodes[] = {
40 // All the quote types
// Hash callback for collapse_set (handed to ez_set_init() in init()).
// n points at a struct wchar_node; the result is (value * 378684) >> 16,
// because '*' binds tighter than '>>'.
// NOTE(review): the multiplier/shift pair looks hand-tuned, presumably with the
// search loop in init() - confirm before changing either constant.
69 static unsigned int wchar_hash(const void *n) {
// Earlier hash candidates, left disabled:
70 //return ez_hash_int32(((struct wchar_node *)n)->value);
71 //return ((struct wchar_node *)n)->value;
72 return ((struct wchar_node *)n)->value * 378684 >> 16;
// Equality callback for collapse_set (handed to ez_set_init() in init()).
// a and b both point at struct wchar_node; returns 1 when their value fields
// are equal and 0 otherwise.
75 static int wchar_equals(const void *a, const void *b) {
76 return ((struct wchar_node *)a)->value == ((struct wchar_node *)b)->value;
// Teardown hook: the destructor attribute asks the toolchain to call this
// automatically at program exit, so nothing in this file calls it directly.
79 static void done(void) __attribute__ ((destructor));
80 static void done(void) {
// Release whatever init() placed in the set.
81 ez_set_clear(&collapse_set);
// Start-up hook: the constructor attribute asks the toolchain to call this
// automatically before main(). It builds collapse_set from collapse_nodes.
84 static void init(void) __attribute__ ((constructor));
85 static void init(void) {
// Third argument is NULL; its meaning is defined by ez_set_init() - TODO confirm.
86 ez_set_init(&collapse_set, wchar_hash, wchar_equals, NULL);
// Insert every table entry by address.
// NOTE(review): i is a signed int compared against an unsigned sizeof
// expression; harmless for a small table but trips -Wsign-compare.
87 for (int i=0;i<sizeof(collapse_nodes)/sizeof(collapse_nodes[0]);i++)
88 ez_set_put(&collapse_set, &collapse_nodes[i]);
// Development aid: brute-force search over a multiplier j and a shift k for a
// hash of the form ((value * j) >> k) & 31 across the table entries.
// NOTE(review): as shown, this runs and prints on every start-up; it may be
// compiled out by a preprocessor guard that is not visible here - confirm.
90 // exhaustive search for perfect hash
91 printf("find best\n");
// k is the candidate right-shift, tried from 0 to 24.
95 for (int k=0;k<25;k++) {
// j is the candidate multiplier, tried from 1 to 999999.
96 for (int j=1;j<1000000;j++) {
// One counter per 5-bit bucket, reset for each (j, k) pair.
98 char hits[32] = { 0 };
100 for (int i=0;i<sizeof(collapse_nodes)/sizeof(collapse_nodes[0]);i++) {
// Bucket index for this entry under the candidate parameters.
101 int h = ((collapse_nodes[i].value * j) >> k) & 31;
102 //h = wchar_hash(&collapse_nodes[i]) & 31;
// Reports a candidate's score c with its j and k; how c is computed is not
// visible here.
109 printf("best c=%d j=%d k=%d\n", c, j, k);
// Reports the best combination recorded so far (bestc/bestj/bestk).
115 printf("best c=%d j=%d k=%d\n", bestc, bestj, bestk);
119 printf("best c=%d j=%d k=%d\n", bestc, bestj, bestk);
// Membership test in the style of the isw*() classifiers: returns 1 when c is
// present in collapse_set and 0 otherwise.
123 static int iswcollapse(wchar_t c) {
// Temporary key node carrying only the character to look up.
124 struct wchar_node key = { .value = c };
// ez_set_get() yields NULL when no matching entry exists.
126 return ez_set_get(&collapse_set, &key) != NULL;
129 Want this stuff pre-defined really
// Converts the multibyte string `words` to wide characters, scans it, and
// appends heap-allocated struct string_node entries to `list`.
//   list   - destination list; entries are added at the tail.
//   suffix - mode selector (main() passes 0 and 1); its effect is not visible
//            in this excerpt - TODO document once confirmed.
//   words  - NUL-terminated multibyte input, decoded with mbsrtowcs().
// The return value is not visible in this excerpt.
132 int analyse_words(ez_list *list, int suffix, const char *words) {
133 size_t len = strlen(words);
// Both buffers are variable-length arrays sized from the input's byte length,
// so they live on the stack.
// NOTE(review): a very long input could exhaust the stack.
134 char word[len+1]; // + ??
135 wchar_t lwords[len+1];
137 const char *t = words;
138 mbstate_t state = { 0 };
// Decode the whole input; len becomes the number of wide characters written.
140 len = mbsrtowcs(lwords, &t, len+1, &state);
141 if (len == (size_t)-1) {
// Invalid multibyte input: print the full string, then the position where
// decoding stopped (mbsrtowcs leaves t at the offending sequence).
142 fprintf(stderr, "'%s' @ '%s'", words, t);
// Hex dump of the raw bytes; & 0xff keeps a signed char from sign-extending.
// NOTE(review): strlen() is re-evaluated on every iteration and compared
// against a signed int index.
144 for (int i=0;i<strlen(words);i++)
145 fprintf(stderr, " %02x", words[i] & 0xff);
146 fprintf(stderr, "\n");
150 //printf("%ls\n", lwords);
// Per-character classification; the surrounding loop and both branch bodies
// are not visible in this excerpt.
// First branch: characters found in collapse_set.
158 if (iswcollapse(c)) {
// Second branch: printable, non-space characters that are not punctuation.
160 } else if (iswgraph(c) && !iswpunct(c)) {
167 // TODO: could keep track of start of each multi-byte char and just write those out
// NOTE(review): if this sits in a nested scope, t and state shadow the outer
// declarations above - confirm.
170 const wchar_t *t = s++;
171 mbstate_t state = { 0 };
// Encode back to multibyte starting at t. sizeof(word) is evaluated at run
// time and equals the original byte length + 1, even though len has been
// reassigned since the array was declared.
173 len = wcsrtombs(word, &t, sizeof(word), &state);
// True only when the result fits with room for the terminator; a (size_t)-1
// error return also fails this test.
174 if (len < sizeof(word)) {
// Node header plus string bytes plus NUL in a single allocation.
175 struct string_node *string = malloc(sizeof(*string) + len + 1);
// NOTE(review): confirm the allocation result is checked before this copy.
177 strcpy(string->value, word);
178 ez_list_addtail(list, string);
// Presumably the failure branch of the length check above - confirm.
184 fprintf(stderr, "overflow %s\n", words);
// Drains `list`, removing entries from the head until ez_list_remhead()
// returns a null pointer.
// NOTE(review): the loop body is not visible here; presumably it frees each
// node allocated by analyse_words() - confirm.
194 void analyse_free(ez_list *list) {
195 struct string_node *w;
197 while ((w = ez_list_remhead(list)))
// Small test driver: runs analyse_words() over one sample sentence with
// suffix set to 0 and then 1, accumulating both results in the same list.
// The remainder of the function is not visible in this excerpt.
201 int main(int argc, char **argv) {
202 ez_list list = EZ_INIT_LIST(list);
204 analyse_words(&list, 0, "this, is a word? Foo-bar O'callahan");
205 analyse_words(&list, 1, "this, is a word? Foo-bar O'callahan");