Back to home page

OSCL-LXR

 
 

    


0001 /*
0002  * unicode.c
0003  *
0004  * PURPOSE
0005  *  Routines for converting between UTF-8 and OSTA Compressed Unicode.
0006  *      Also handles filename mangling
0007  *
0008  * DESCRIPTION
0009  *  OSTA Compressed Unicode is explained in the OSTA UDF specification.
0010  *      http://www.osta.org/
0011  *  UTF-8 is explained in IETF RFC 3629.
0012  *      https://www.rfc-editor.org/rfc/rfc3629.txt
0013  *
0014  * COPYRIGHT
0015  *  This file is distributed under the terms of the GNU General Public
0016  *  License (GPL). Copies of the GPL can be obtained from:
0017  *      ftp://prep.ai.mit.edu/pub/gnu/GPL
0018  *  Each contributing author retains all rights to their own work.
0019  */
0020 
0021 #include "udfdecl.h"
0022 
0023 #include <linux/kernel.h>
0024 #include <linux/string.h>   /* for memset */
0025 #include <linux/nls.h>
0026 #include <linux/crc-itu-t.h>
0027 #include <linux/slab.h>
0028 
0029 #include "udf_sb.h"
0030 
/* Offset added when a UTF-16 surrogate pair is combined into one character */
#define PLANE_SIZE              0x10000
/* Largest character value accepted as valid by the conversion routines */
#define UNICODE_MAX             0x10ffff

/* A 16-bit unit is a surrogate when (unit & SURROGATE_MASK) == SURROGATE_PAIR */
#define SURROGATE_MASK          0xfffff800
#define SURROGATE_PAIR          0xd800
/* Bit that is set in the low (second) surrogate of a pair */
#define SURROGATE_LOW           0x400
/* Number of payload bits carried by each surrogate, and the matching mask */
#define SURROGATE_CHAR_BITS     10
#define SURROGATE_CHAR_MASK     ((1 << SURROGATE_CHAR_BITS) - 1)

/* Replacement written for characters that cannot be converted */
#define ILLEGAL_CHAR_MARK       '_'
/* Character separating a file name from its extension */
#define EXT_MARK                '.'
/* Character introducing the CRC appended to mangled names */
#define CRC_MARK                '#'
/* Maximum number of characters examined when looking for an extension */
#define EXT_SIZE                5
/* Number of chars we need to store generated CRC to make filename unique */
#define CRC_LEN                 5
0045 
0046 static unicode_t get_utf16_char(const uint8_t *str_i, int str_i_max_len,
0047                 int str_i_idx, int u_ch, unicode_t *ret)
0048 {
0049     unicode_t c;
0050     int start_idx = str_i_idx;
0051 
0052     /* Expand OSTA compressed Unicode to Unicode */
0053     c = str_i[str_i_idx++];
0054     if (u_ch > 1)
0055         c = (c << 8) | str_i[str_i_idx++];
0056     if ((c & SURROGATE_MASK) == SURROGATE_PAIR) {
0057         unicode_t next;
0058 
0059         /* Trailing surrogate char */
0060         if (str_i_idx >= str_i_max_len) {
0061             c = UNICODE_MAX + 1;
0062             goto out;
0063         }
0064 
0065         /* Low surrogate must follow the high one... */
0066         if (c & SURROGATE_LOW) {
0067             c = UNICODE_MAX + 1;
0068             goto out;
0069         }
0070 
0071         WARN_ON_ONCE(u_ch != 2);
0072         next = str_i[str_i_idx++] << 8;
0073         next |= str_i[str_i_idx++];
0074         if ((next & SURROGATE_MASK) != SURROGATE_PAIR ||
0075             !(next & SURROGATE_LOW)) {
0076             c = UNICODE_MAX + 1;
0077             goto out;
0078         }
0079 
0080         c = PLANE_SIZE +
0081             ((c & SURROGATE_CHAR_MASK) << SURROGATE_CHAR_BITS) +
0082             (next & SURROGATE_CHAR_MASK);
0083     }
0084 out:
0085     *ret = c;
0086     return str_i_idx - start_idx;
0087 }
0088 
0089 
0090 static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
0091                   int *str_o_idx,
0092                   const uint8_t *str_i, int str_i_max_len,
0093                   int *str_i_idx,
0094                   int u_ch, int *needsCRC,
0095                   int (*conv_f)(wchar_t, unsigned char *, int),
0096                   int translate)
0097 {
0098     unicode_t c;
0099     int illChar = 0;
0100     int len, gotch = 0;
0101 
0102     while (!gotch && *str_i_idx < str_i_max_len) {
0103         if (*str_o_idx >= str_o_max_len) {
0104             *needsCRC = 1;
0105             return gotch;
0106         }
0107 
0108         len = get_utf16_char(str_i, str_i_max_len, *str_i_idx, u_ch,
0109                      &c);
0110         /* These chars cannot be converted. Replace them. */
0111         if (c == 0 || c > UNICODE_MAX || (conv_f && c > MAX_WCHAR_T) ||
0112             (translate && c == '/')) {
0113             illChar = 1;
0114             if (!translate)
0115                 gotch = 1;
0116         } else if (illChar)
0117             break;
0118         else
0119             gotch = 1;
0120         *str_i_idx += len;
0121     }
0122     if (illChar) {
0123         *needsCRC = 1;
0124         c = ILLEGAL_CHAR_MARK;
0125         gotch = 1;
0126     }
0127     if (gotch) {
0128         if (conv_f) {
0129             len = conv_f(c, &str_o[*str_o_idx],
0130                      str_o_max_len - *str_o_idx);
0131         } else {
0132             len = utf32_to_utf8(c, &str_o[*str_o_idx],
0133                         str_o_max_len - *str_o_idx);
0134             if (len < 0)
0135                 len = -ENAMETOOLONG;
0136         }
0137         /* Valid character? */
0138         if (len >= 0)
0139             *str_o_idx += len;
0140         else if (len == -ENAMETOOLONG) {
0141             *needsCRC = 1;
0142             gotch = 0;
0143         } else {
0144             str_o[(*str_o_idx)++] = ILLEGAL_CHAR_MARK;
0145             *needsCRC = 1;
0146         }
0147     }
0148     return gotch;
0149 }
0150 
0151 static int udf_name_from_CS0(struct super_block *sb,
0152                  uint8_t *str_o, int str_max_len,
0153                  const uint8_t *ocu, int ocu_len,
0154                  int translate)
0155 {
0156     uint32_t c;
0157     uint8_t cmp_id;
0158     int idx, len;
0159     int u_ch;
0160     int needsCRC = 0;
0161     int ext_i_len, ext_max_len;
0162     int str_o_len = 0;  /* Length of resulting output */
0163     int ext_o_len = 0;  /* Extension output length */
0164     int ext_crc_len = 0;    /* Extension output length if used with CRC */
0165     int i_ext = -1;     /* Extension position in input buffer */
0166     int o_crc = 0;      /* Rightmost possible output pos for CRC+ext */
0167     unsigned short valueCRC;
0168     uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1];
0169     uint8_t crc[CRC_LEN];
0170     int (*conv_f)(wchar_t, unsigned char *, int);
0171 
0172     if (str_max_len <= 0)
0173         return 0;
0174 
0175     if (ocu_len == 0) {
0176         memset(str_o, 0, str_max_len);
0177         return 0;
0178     }
0179 
0180     if (UDF_SB(sb)->s_nls_map)
0181         conv_f = UDF_SB(sb)->s_nls_map->uni2char;
0182     else
0183         conv_f = NULL;
0184 
0185     cmp_id = ocu[0];
0186     if (cmp_id != 8 && cmp_id != 16) {
0187         memset(str_o, 0, str_max_len);
0188         pr_err("unknown compression code (%u)\n", cmp_id);
0189         return -EINVAL;
0190     }
0191     u_ch = cmp_id >> 3;
0192 
0193     ocu++;
0194     ocu_len--;
0195 
0196     if (ocu_len % u_ch) {
0197         pr_err("incorrect filename length (%d)\n", ocu_len + 1);
0198         return -EINVAL;
0199     }
0200 
0201     if (translate) {
0202         /* Look for extension */
0203         for (idx = ocu_len - u_ch, ext_i_len = 0;
0204              (idx >= 0) && (ext_i_len < EXT_SIZE);
0205              idx -= u_ch, ext_i_len++) {
0206             c = ocu[idx];
0207             if (u_ch > 1)
0208                 c = (c << 8) | ocu[idx + 1];
0209 
0210             if (c == EXT_MARK) {
0211                 if (ext_i_len)
0212                     i_ext = idx;
0213                 break;
0214             }
0215         }
0216         if (i_ext >= 0) {
0217             /* Convert extension */
0218             ext_max_len = min_t(int, sizeof(ext), str_max_len);
0219             ext[ext_o_len++] = EXT_MARK;
0220             idx = i_ext + u_ch;
0221             while (udf_name_conv_char(ext, ext_max_len, &ext_o_len,
0222                           ocu, ocu_len, &idx,
0223                           u_ch, &needsCRC,
0224                           conv_f, translate)) {
0225                 if ((ext_o_len + CRC_LEN) < str_max_len)
0226                     ext_crc_len = ext_o_len;
0227             }
0228         }
0229     }
0230 
0231     idx = 0;
0232     while (1) {
0233         if (translate && (idx == i_ext)) {
0234             if (str_o_len > (str_max_len - ext_o_len))
0235                 needsCRC = 1;
0236             break;
0237         }
0238 
0239         if (!udf_name_conv_char(str_o, str_max_len, &str_o_len,
0240                     ocu, ocu_len, &idx,
0241                     u_ch, &needsCRC, conv_f, translate))
0242             break;
0243 
0244         if (translate &&
0245             (str_o_len <= (str_max_len - ext_o_len - CRC_LEN)))
0246             o_crc = str_o_len;
0247     }
0248 
0249     if (translate) {
0250         if (str_o_len <= 2 && str_o[0] == '.' &&
0251             (str_o_len == 1 || str_o[1] == '.'))
0252             needsCRC = 1;
0253         if (needsCRC) {
0254             str_o_len = o_crc;
0255             valueCRC = crc_itu_t(0, ocu, ocu_len);
0256             crc[0] = CRC_MARK;
0257             crc[1] = hex_asc_upper_hi(valueCRC >> 8);
0258             crc[2] = hex_asc_upper_lo(valueCRC >> 8);
0259             crc[3] = hex_asc_upper_hi(valueCRC);
0260             crc[4] = hex_asc_upper_lo(valueCRC);
0261             len = min_t(int, CRC_LEN, str_max_len - str_o_len);
0262             memcpy(&str_o[str_o_len], crc, len);
0263             str_o_len += len;
0264             ext_o_len = ext_crc_len;
0265         }
0266         if (ext_o_len > 0) {
0267             memcpy(&str_o[str_o_len], ext, ext_o_len);
0268             str_o_len += ext_o_len;
0269         }
0270     }
0271 
0272     return str_o_len;
0273 }
0274 
0275 static int udf_name_to_CS0(struct super_block *sb,
0276                uint8_t *ocu, int ocu_max_len,
0277                const uint8_t *str_i, int str_len)
0278 {
0279     int i, len;
0280     unsigned int max_val;
0281     int u_len, u_ch;
0282     unicode_t uni_char;
0283     int (*conv_f)(const unsigned char *, int, wchar_t *);
0284 
0285     if (ocu_max_len <= 0)
0286         return 0;
0287 
0288     if (UDF_SB(sb)->s_nls_map)
0289         conv_f = UDF_SB(sb)->s_nls_map->char2uni;
0290     else
0291         conv_f = NULL;
0292 
0293     memset(ocu, 0, ocu_max_len);
0294     ocu[0] = 8;
0295     max_val = 0xff;
0296     u_ch = 1;
0297 
0298 try_again:
0299     u_len = 1;
0300     for (i = 0; i < str_len; i += len) {
0301         /* Name didn't fit? */
0302         if (u_len + u_ch > ocu_max_len)
0303             return 0;
0304         if (conv_f) {
0305             wchar_t wchar;
0306 
0307             len = conv_f(&str_i[i], str_len - i, &wchar);
0308             if (len > 0)
0309                 uni_char = wchar;
0310         } else {
0311             len = utf8_to_utf32(&str_i[i], str_len - i,
0312                         &uni_char);
0313         }
0314         /* Invalid character, deal with it */
0315         if (len <= 0 || uni_char > UNICODE_MAX) {
0316             len = 1;
0317             uni_char = '?';
0318         }
0319 
0320         if (uni_char > max_val) {
0321             unicode_t c;
0322 
0323             if (max_val == 0xff) {
0324                 max_val = 0xffff;
0325                 ocu[0] = 0x10;
0326                 u_ch = 2;
0327                 goto try_again;
0328             }
0329             /*
0330              * Use UTF-16 encoding for chars outside we
0331              * cannot encode directly.
0332              */
0333             if (u_len + 2 * u_ch > ocu_max_len)
0334                 return 0;
0335 
0336             uni_char -= PLANE_SIZE;
0337             c = SURROGATE_PAIR |
0338                 ((uni_char >> SURROGATE_CHAR_BITS) &
0339                  SURROGATE_CHAR_MASK);
0340             ocu[u_len++] = (uint8_t)(c >> 8);
0341             ocu[u_len++] = (uint8_t)(c & 0xff);
0342             uni_char = SURROGATE_PAIR | SURROGATE_LOW |
0343                     (uni_char & SURROGATE_CHAR_MASK);
0344         }
0345 
0346         if (max_val == 0xffff)
0347             ocu[u_len++] = (uint8_t)(uni_char >> 8);
0348         ocu[u_len++] = (uint8_t)(uni_char & 0xff);
0349     }
0350 
0351     return u_len;
0352 }
0353 
/*
 * Convert CS0 dstring to output charset. Warning: This function may truncate
 * input string if it is too long as it is used for informational strings only
 * and it is better to truncate the string than to refuse mounting a media.
 *
 * @sb:    superblock selecting the output charset
 * @utf_o: output buffer of @o_len bytes
 * @ocu_i: dstring field of @i_len bytes; its last byte holds the number of
 *         bytes in use (compression ID included)
 *
 * Returns the result of udf_name_from_CS0() for the used part of the field:
 * the output length, or a negative error code.
 */
int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len,
              const uint8_t *ocu_i, int i_len)
{
    int s_len = 0;

    if (i_len > 0) {
        s_len = ocu_i[i_len - 1];
        if (s_len >= i_len) {
            pr_warn("incorrect dstring lengths (%d/%d),"
                " truncating\n", s_len, i_len);
            s_len = i_len - 1;
            /*
             * 2-byte encoding? The s_len - 1 bytes following the
             * compression ID must form whole 16-bit units, so drop
             * a dangling odd byte.
             */
            if (s_len > 0 && ocu_i[0] == 16)
                s_len -= (s_len - 1) & 1;
        }
    }

    return udf_name_from_CS0(sb, utf_o, o_len, ocu_i, s_len, 0);
}
0378 
/*
 * Convert an on-disk CS0 file name (@sname, @slen bytes) into the output
 * charset, mangling it where necessary, and store it in @dname (@dlen bytes).
 *
 * Returns the length of the converted name, 0 if @dlen <= 0, -EIO if @slen
 * is 0, -EINVAL if the conversion produced an empty name, or the negative
 * error code of the conversion.
 */
int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen,
             uint8_t *dname, int dlen)
{
    int out_len;

    if (slen == 0)
        return -EIO;
    if (dlen <= 0)
        return 0;

    out_len = udf_name_from_CS0(sb, dname, dlen, sname, slen, 1);

    /* Zero length filename isn't valid... */
    return out_len ? out_len : -EINVAL;
}
0396 
/*
 * Convert the file name @sname (@slen bytes, mount charset) to its on-disk
 * CS0 form in @dname (@dlen bytes).
 *
 * Returns whatever udf_name_to_CS0() returns: the number of bytes stored, or
 * 0 when the name does not fit.
 */
int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen,
             uint8_t *dname, int dlen)
{
    int cs0_len = udf_name_to_CS0(sb, dname, dlen, sname, slen);

    return cs0_len;
}
0402