LCOV - code coverage report
Current view: top level - src - lutf8lib.c Coverage Total Hit
Test: Lua 5.3.6 Lines: 98.4 % 126 124
Test Date: 2024-04-28 10:23:15
Legend: Lines: hit not hit

            Line data    Source code
       1              : /*
       2              : ** $Id: lutf8lib.c,v 1.16.1.1 2017/04/19 17:29:57 roberto Exp $
       3              : ** Standard library for UTF-8 manipulation
       4              : ** See Copyright Notice in lua.h
       5              : */
       6              : 
       7              : #define lutf8lib_c
       8              : #define LUA_LIB
       9              : 
      10              : #include "lprefix.h"
      11              : 
      12              : 
      13              : #include <assert.h>
      14              : #include <limits.h>
      15              : #include <stdlib.h>
      16              : #include <string.h>
      17              : 
      18              : #include "lua.h"
      19              : 
      20              : #include "lauxlib.h"
      21              : #include "lualib.h"
      22              : 
      23              : #define MAXUNICODE      0x10FFFF
      24              : 
      25              : #define iscont(p)       ((*(p) & 0xC0) == 0x80)
      26              : 
      27              : 
      28              : /* from strlib */
      29              : /* translate a relative string position: negative means back from end */
      30           79 : static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
      31           79 :   if (pos >= 0) return pos;
      32           21 :   else if (0u - (size_t)pos > len) return 0;
      33           20 :   else return (lua_Integer)len + pos + 1;
      34              : }
      35              : 
      36              : 
      37              : /*
      38              : ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
      39              : */
      40           42 : static const char *utf8_decode (const char *o, int *val) {
      41              :   static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
      42           42 :   const unsigned char *s = (const unsigned char *)o;
      43           42 :   unsigned int c = s[0];
      44           42 :   unsigned int res = 0;  /* final result */
      45           42 :   if (c < 0x80)  /* ascii? */
      46           34 :     res = c;
      47              :   else {
      48            8 :     int count = 0;  /* to count number of continuation bytes */
      49           18 :     while (c & 0x40) {  /* still have continuation bytes? */
      50           13 :       int cc = s[++count];  /* read next byte */
      51           13 :       if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
      52            3 :         return NULL;  /* invalid byte sequence */
      53           10 :       res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
      54           10 :       c <<= 1;  /* to test next bit */
      55              :     }
      56            5 :     res |= ((c & 0x7F) << (count * 5));  /* add first byte */
      57            5 :     if (count > 3 || res > MAXUNICODE || res <= limits[count])
      58            0 :       return NULL;  /* invalid byte sequence */
      59            5 :     s += count;  /* skip continuation bytes read */
      60              :   }
      61           39 :   if (val) *val = res;
      62           39 :   return (const char *)s + 1;  /* +1 to include first byte */
      63              : }
      64              : 
      65              : 
      66              : /*
      67              : ** utf8len(s [, i [, j]]) --> number of characters that start in the
      68              : ** range [i,j], or nil + current position if 's' is not well formed in
      69              : ** that interval
      70              : */
      71           12 : static int utflen (lua_State *L) {
      72           12 :   int n = 0;
      73              :   size_t len;
      74           12 :   const char *s = luaL_checklstring(L, 1, &len);
      75           12 :   lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
      76           12 :   lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
      77           12 :   luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
      78              :                    "initial position out of string");
      79           11 :   luaL_argcheck(L, --posj < (lua_Integer)len, 3,
      80              :                    "final position out of string");
      81           31 :   while (posi <= posj) {
      82           21 :     const char *s1 = utf8_decode(s + posi, NULL);
      83           21 :     if (s1 == NULL) {  /* conversion error? */
      84            1 :       lua_pushnil(L);  /* return nil ... */
      85            1 :       lua_pushinteger(L, posi + 1);  /* ... and current position */
      86            1 :       return 2;
      87              :     }
      88           20 :     posi = s1 - s;
      89           20 :     n++;
      90              :   }
      91           10 :   lua_pushinteger(L, n);
      92           10 :   return 1;
      93              : }
      94              : 
      95              : 
      96              : /*
      97              : ** codepoint(s, [i, [j]])  -> returns codepoints for all characters
      98              : ** that start in the range [i,j]
      99              : */
     100            9 : static int codepoint (lua_State *L) {
     101              :   size_t len;
     102            9 :   const char *s = luaL_checklstring(L, 1, &len);
     103            9 :   lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
     104            9 :   lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
     105              :   int n;
     106              :   const char *se;
     107            9 :   luaL_argcheck(L, posi >= 1, 2, "out of range");
     108            9 :   luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range");
     109            7 :   if (posi > pose) return 0;  /* empty interval; return no values */
     110            7 :   if (pose - posi >= INT_MAX)  /* (lua_Integer -> int) overflow? */
     111            0 :     return luaL_error(L, "string slice too long");
     112            7 :   n = (int)(pose -  posi) + 1;
     113            7 :   luaL_checkstack(L, n, "string slice too long");
     114            7 :   n = 0;
     115            7 :   se = s + pose;
     116           16 :   for (s += posi - 1; s < se;) {
     117              :     int code;
     118           10 :     s = utf8_decode(s, &code);
     119           10 :     if (s == NULL)
     120            1 :       return luaL_error(L, "invalid UTF-8 code");
     121            9 :     lua_pushinteger(L, code);
     122            9 :     n++;
     123              :   }
     124            6 :   return n;
     125              : }
     126              : 
     127              : 
     128           17 : static void pushutfchar (lua_State *L, int arg) {
     129           17 :   lua_Integer code = luaL_checkinteger(L, arg);
     130           16 :   luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
     131           14 :   lua_pushfstring(L, "%U", (long)code);
     132           14 : }
     133              : 
     134              : 
     135              : /*
     136              : ** utfchar(n1, n2, ...)  -> char(n1)..char(n2)...
     137              : */
     138           14 : static int utfchar (lua_State *L) {
     139           14 :   int n = lua_gettop(L);  /* number of arguments */
     140           14 :   if (n == 1)  /* optimize common case of single char */
     141           10 :     pushutfchar(L, 1);
     142              :   else {
     143              :     int i;
     144              :     luaL_Buffer b;
     145            4 :     luaL_buffinit(L, &b);
     146            9 :     for (i = 1; i <= n; i++) {
     147            7 :       pushutfchar(L, i);
     148            5 :       luaL_addvalue(&b);
     149              :     }
     150            2 :     luaL_pushresult(&b);
     151              :   }
     152           11 :   return 1;
     153              : }
     154              : 
     155              : 
     156              : /*
     157              : ** offset(s, n, [i])  -> index where n-th character counting from
     158              : **   position 'i' starts; 0 means character at 'i'.
     159              : */
     160           37 : static int byteoffset (lua_State *L) {
     161              :   size_t len;
     162           37 :   const char *s = luaL_checklstring(L, 1, &len);
     163           37 :   lua_Integer n  = luaL_checkinteger(L, 2);
     164           37 :   lua_Integer posi = (n >= 0) ? 1 : len + 1;
     165           37 :   posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
     166           37 :   luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
     167              :                    "position out of range");
     168           36 :   if (n == 0) {
     169              :     /* find beginning of current byte sequence */
     170            9 :     while (posi > 0 && iscont(s + posi)) posi--;
     171              :   }
     172              :   else {
     173           30 :     if (iscont(s + posi))
     174            1 :       return luaL_error(L, "initial position is a continuation byte");
     175           29 :     if (n < 0) {
     176           26 :        while (n < 0 && posi > 0) {  /* move back */
     177              :          do {  /* find beginning of previous character */
     178           27 :            posi--;
     179           27 :          } while (posi > 0 && iscont(s + posi));
     180           15 :          n++;
     181              :        }
     182              :      }
     183              :      else {
     184           18 :        n--;  /* do not move for 1st character */
     185           38 :        while (n > 0 && posi < (lua_Integer)len) {
     186              :          do {  /* find beginning of next character */
     187           36 :            posi++;
     188           36 :          } while (iscont(s + posi));  /* (cannot pass final '\0') */
     189           20 :          n--;
     190              :        }
     191              :      }
     192              :   }
     193           35 :   if (n == 0)  /* did it find given character? */
     194           26 :     lua_pushinteger(L, posi + 1);
     195              :   else  /* no such character */
     196            9 :     lua_pushnil(L);
     197           35 :   return 1;
     198              : }
     199              : 
     200              : 
     201           13 : static int iter_aux (lua_State *L) {
     202              :   size_t len;
     203           13 :   const char *s = luaL_checklstring(L, 1, &len);
     204           13 :   lua_Integer n = lua_tointeger(L, 2) - 1;
     205           13 :   if (n < 0)  /* first iteration? */
     206            3 :     n = 0;  /* start from here */
     207           10 :   else if (n < (lua_Integer)len) {
     208           10 :     n++;  /* skip current byte */
     209           12 :     while (iscont(s + n)) n++;  /* and its continuations */
     210              :   }
     211           13 :   if (n >= (lua_Integer)len)
     212            2 :     return 0;  /* no more codepoints */
     213              :   else {
     214              :     int code;
     215           11 :     const char *next = utf8_decode(s + n, &code);
     216           11 :     if (next == NULL || iscont(next))
     217            1 :       return luaL_error(L, "invalid UTF-8 code");
     218           10 :     lua_pushinteger(L, n + 1);
     219           10 :     lua_pushinteger(L, code);
     220           10 :     return 2;
     221              :   }
     222              : }
     223              : 
     224              : 
     225            5 : static int iter_codes (lua_State *L) {
     226            5 :   luaL_checkstring(L, 1);
     227            3 :   lua_pushcfunction(L, iter_aux);
     228            3 :   lua_pushvalue(L, 1);
     229            3 :   lua_pushinteger(L, 0);
     230            3 :   return 3;
     231              : }
     232              : 
     233              : 
     234              : /* pattern to match a single UTF-8 character */
     235              : #define UTF8PATT        "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
     236              : 
     237              : 
     238              : static const luaL_Reg funcs[] = {
     239              :   {"offset", byteoffset},
     240              :   {"codepoint", codepoint},
     241              :   {"char", utfchar},
     242              :   {"len", utflen},
     243              :   {"codes", iter_codes},
     244              :   /* placeholders */
     245              :   {"charpattern", NULL},
     246              :   {NULL, NULL}
     247              : };
     248              : 
     249              : 
     250           86 : LUAMOD_API int luaopen_utf8 (lua_State *L) {
     251           86 :   luaL_newlib(L, funcs);
     252           86 :   lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1);
     253           86 :   lua_setfield(L, -2, "charpattern");
     254           86 :   return 1;
     255              : }
     256              : 
        

Generated by: LCOV version 2.0-1