utf8expr.c (2190B)
1 /* 2 * Copy me if you can. 3 * by 20h 4 */ 5 6 #include <unistd.h> 7 #include <string.h> 8 #include <stdlib.h> 9 #include <stdio.h> 10 #include <libgen.h> 11 12 #include "arg.h" 13 14 char *argv0; 15 16 /* 17 * Idea taken from: 18 * http://canonical.org/~kragen/strlen-utf8.html 19 */ 20 size_t 21 utf8strlen(char *s) 22 { 23 size_t i; 24 25 i = 0; 26 for (; s[0]; s++) { 27 if ((s[0] & 0xc0) != 0x80) 28 i++; 29 } 30 31 return i; 32 } 33 34 char * 35 utf8strchr(char *s, char *c) 36 { 37 size_t j, cl; 38 39 cl = strlen(c); 40 if (cl == 0) 41 return NULL; 42 43 for (j = 0; ; s++) { 44 if (j > 6) 45 return NULL; 46 j++; 47 48 if ((s[0] & 0xc0) != 0x80 || s[0] == '\0') { 49 if (cl == j) { 50 if (!memcmp(&s[-j], c, cl)) 51 return &s[-j]; 52 } 53 j = 0; 54 55 if (s[0] == '\0') 56 break; 57 } 58 } 59 60 return NULL; 61 } 62 63 char * 64 utf8substr(char *s, size_t pos, size_t *length) 65 { 66 size_t i, j, rl; 67 char *ret; 68 69 if (*length < 1) 70 return NULL; 71 72 ret = NULL; 73 rl = 0; 74 for (i = 0, j = 0; *length > 0; s++) { 75 if (j > 6) 76 return NULL; 77 j++; 78 79 if (ret != NULL) 80 rl++; 81 82 if ((s[0] & 0xc0) != 0x80 || s[0] == '\0') { 83 if (i >= pos) { 84 if (ret == NULL) { 85 ret = &s[-j]; 86 rl = j; 87 } 88 (*length)--; 89 } 90 i++; 91 j = 0; 92 93 if (s[0] == '\0') 94 break; 95 } 96 } 97 98 *length = rl; 99 return ret; 100 } 101 102 size_t 103 utf8index(char *s, char *chars) 104 { 105 size_t i, j; 106 char c[7]; 107 108 j = 0; 109 for (i = 0; ; s++) { 110 if (j > 6) 111 return 0; 112 j++; 113 114 if ((s[0] & 0xc0) != 0x80 || s[0] == '\0') { 115 memset(c, 0, sizeof(c)); 116 memmove(c, &s[-j], j); 117 if (utf8strchr(chars, c)) 118 return i; 119 i++; 120 j = 0; 121 122 if (s[0] == '\0') 123 break; 124 } 125 } 126 127 return 0; 128 } 129 130 void 131 usage(void) 132 { 133 fprintf(stderr, "usage: %s [substr|index|length] str [args ...]\n", 134 basename(argv0)); 135 exit(1); 136 } 137 138 int 139 main(int argc, char *argv[]) 140 { 141 char *s; 142 size_t len; 143 144 argv0 = argv[0]; 145 146 if (argc < 3) 147 usage(); 148 149 switch(argv[1][0]) { 150 case 'i': 151 if (argc < 4) 152 usage(); 153 printf("%ld\n", utf8index(argv[2], argv[3])); 154 break; 155 case 'l': 156 printf("%ld\n", utf8strlen(argv[2])); 157 break; 158 case 's': 159 if (argc < 5) 160 usage(); 161 len = atoi(argv[4]); 162 s = utf8substr(argv[2], atoi(argv[3]), &len); 163 if (s == NULL) 164 return -1; 165 printf("%.*s\n", (int)len, s); 166 break; 167 default: 168 usage(); 169 }; 170 171 return 0; 172 } 173