lib/unicode.c

   1 /*
   2  * Copyright (c) 2009, 2010 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18
  19 #include "unicode.h"
  20
  21 #include <inttypes.h>
  22
  23 #include "dynamic-string.h"
  24 #include "util.h"
  25
  26 /* Returns the unicode code point corresponding to leading surrogate 'leading'
  27  * and trailing surrogate 'trailing'.  The return value will not make any
  28  * sense if 'leading' or 'trailing' are not in the correct ranges for leading
  29  * or trailing surrogates. */
  30 int
  31 utf16_decode_surrogate_pair(int leading, int trailing)
  32 {
  33     /*
  34      *  Leading surrogate:         110110wwwwxxxxxx
  35      * Trailing surrogate:         110111xxxxxxxxxx
  36      *         Code point: 000uuuuuxxxxxxxxxxxxxxxx
  37      */
  38     int w = (leading >> 6) & 0xf;
  39     int u = w + 1;
  40     int x0 = leading & 0x3f;
  41     int x1 = trailing & 0x3ff;
  42     return (u << 16) | (x0 << 10) | x1;
  43 }
  44
  45 /* Returns the number of Unicode characters in UTF-8 string 's'. */
  46 size_t
  47 utf8_length(const char *s_)
  48 {
  49     const uint8_t *s;
  50     size_t length;
  51
  52     length = 0;
  53     for (s = (const uint8_t *) s_; *s != '\0'; s++) {
  54         /* The most-significant bits of the first byte in a character are one
  55          * of 2#01, 2#00, or 2#11.  2#10 is a continuation byte. */
  56         length += (*s & 0xc0) != 0x80;
  57     }
  58     return length;
  59 }
  60
  61 static char *
  62 invalid_utf8_sequence(const uint8_t *s, int n, size_t *lengthp)
  63 {
  64     struct ds msg;
  65     int i;
  66
  67     if (lengthp) {
  68         *lengthp = 0;
  69     }
  70
  71     ds_init(&msg);
  72     ds_put_cstr(&msg, "invalid UTF-8 sequence");
  73     for (i = 0; i < n; i++) {
  74         ds_put_format(&msg, " 0x%02"PRIx8, s[i]);
  75     }
  76     return ds_steal_cstr(&msg);
  77 }
  78
  79 struct utf8_sequence {
  80     uint8_t octets[5][2];
  81 };
  82
  83 static const struct utf8_sequence *
  84 lookup_utf8_sequence(uint8_t c)
  85 {
  86     static const struct utf8_sequence seqs[] = {
  87         { { { 0x01, 0x7f },
  88             { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } } },
  89
  90         { { { 0xc2, 0xdf }, { 0x80, 0xbf },
  91             { 0, 0 }, { 0, 0 }, { 0, 0 } } },
  92
  93         { { { 0xe0, 0xe0 }, { 0xa0, 0xbf }, { 0x80, 0xbf },
  94             {0,0}, {0, 0 } } },
  95
  96         { { { 0xe1, 0xec }, { 0x80, 0xbf }, { 0x80, 0xbf },
  97             { 0, 0 }, { 0, 0 } } },
  98
  99         { { { 0xed, 0xed }, { 0x80, 0x9f }, { 0x80, 0xbf },
 100             { 0, 0 }, { 0, 0 } } },
 101
 102         { { { 0xee, 0xef }, { 0x80, 0xbf }, { 0x80, 0xbf },
 103             { 0, 0 }, { 0, 0 } } },
 104
 105         { { { 0xf0, 0xf0 }, { 0x90, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },
 106             { 0, 0 } } },
 107
 108         { { { 0xf1, 0xf3 }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },
 109             { 0, 0 } } },
 110
 111         { { { 0xf4, 0xf4 }, { 0x80, 0x8f }, { 0x80, 0xbf }, { 0x80, 0xbf },
 112             { 0, 0 } } },
 113     };
 114
 115     size_t i;
 116
 117     for (i = 0; i < ARRAY_SIZE(seqs); i++) {
 118         const uint8_t *o = seqs[i].octets[0];
 119         if (c >= o[0] && c <= o[1]) {
 120             return &seqs[i];
 121         }
 122     }
 123     return NULL;
 124 }
 125
 126 /* Checks that 's' is a valid, null-terminated UTF-8 string.  If so, returns a
 127  * null pointer and sets '*lengthp' to the number of Unicode characters in
 128  * 's'.  If not, returns an error message that the caller must free and sets
 129  * '*lengthp' to 0.
 130  *
 131  * 'lengthp' may be NULL if the length is not needed. */
 132 char *
 133 utf8_validate(const char *s_, size_t *lengthp)
 134 {
 135     size_t length = 0;
 136     const uint8_t *s;
 137
 138     for (s = (const uint8_t *) s_; *s != '\0'; ) {
 139         length++;
 140         if (s[0] < 0x80) {
 141             s++;
 142         } else {
 143             const struct utf8_sequence *seq;
 144             int i;
 145
 146             seq = lookup_utf8_sequence(s[0]);
 147             if (!seq) {
 148                 return invalid_utf8_sequence(s, 1, lengthp);
 149             }
 150
 151             for (i = 1; seq->octets[i][0]; i++) {
 152                 const uint8_t *o = seq->octets[i];
 153                 if (s[i] < o[0] || s[i] > o[1]) {
 154                     return invalid_utf8_sequence(s, i + 1, lengthp);
 155                 }
 156             }
 157             s += i;
 158         }
 159     }
 160     if (lengthp) {
 161         *lengthp = length;
 162     }
 163     return NULL;
 164 }