fs/cifs/cifs_unicode.c

0001 // SPDX-License-Identifier: GPL-2.0-or-later
0002 /*
0003  *
0004  *   Copyright (c) International Business Machines  Corp., 2000,2009
0005  *   Modified by Steve French (sfrench@us.ibm.com)
0006  */
0007 #include <linux/fs.h>
0008 #include <linux/slab.h>
0009 #include "cifs_fs_sb.h"
0010 #include "cifs_unicode.h"
0011 #include "cifs_uniupr.h"
0012 #include "cifspdu.h"
0013 #include "cifsglob.h"
0014 #include "cifs_debug.h"
0015
0016 int cifs_remap(struct cifs_sb_info *cifs_sb)
0017 {
0018     int map_type;
0019
0020     if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
0021         map_type = SFM_MAP_UNI_RSVD;
0022     else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
0023         map_type = SFU_MAP_UNI_RSVD;
0024     else
0025         map_type = NO_MAP_UNI_RSVD;
0026
0027     return map_type;
0028 }
0029
0030 /* Convert character using the SFU - "Services for Unix" remapping range */
0031 static bool
0032 convert_sfu_char(const __u16 src_char, char *target)
0033 {
0034     /*
0035      * BB: Cannot handle remapping UNI_SLASH until all the calls to
0036      *     build_path_from_dentry are modified, as they use slash as
0037      *     separator.
0038      */
0039     switch (src_char) {
0040     case UNI_COLON:
0041         *target = ':';
0042         break;
0043     case UNI_ASTERISK:
0044         *target = '*';
0045         break;
0046     case UNI_QUESTION:
0047         *target = '?';
0048         break;
0049     case UNI_PIPE:
0050         *target = '|';
0051         break;
0052     case UNI_GRTRTHAN:
0053         *target = '>';
0054         break;
0055     case UNI_LESSTHAN:
0056         *target = '<';
0057         break;
0058     default:
0059         return false;
0060     }
0061     return true;
0062 }
0063
0064 /* Convert character using the SFM - "Services for Mac" remapping range */
0065 static bool
0066 convert_sfm_char(const __u16 src_char, char *target)
0067 {
0068     if (src_char >= 0xF001 && src_char <= 0xF01F) {
0069         *target = src_char - 0xF000;
0070         return true;
0071     }
0072     switch (src_char) {
0073     case SFM_COLON:
0074         *target = ':';
0075         break;
0076     case SFM_DOUBLEQUOTE:
0077         *target = '"';
0078         break;
0079     case SFM_ASTERISK:
0080         *target = '*';
0081         break;
0082     case SFM_QUESTION:
0083         *target = '?';
0084         break;
0085     case SFM_PIPE:
0086         *target = '|';
0087         break;
0088     case SFM_GRTRTHAN:
0089         *target = '>';
0090         break;
0091     case SFM_LESSTHAN:
0092         *target = '<';
0093         break;
0094     case SFM_SPACE:
0095         *target = ' ';
0096         break;
0097     case SFM_PERIOD:
0098         *target = '.';
0099         break;
0100     default:
0101         return false;
0102     }
0103     return true;
0104 }
0105
0106
0107 /*
0108  * cifs_mapchar - convert a host-endian char to proper char in codepage
0109  * @target - where converted character should be copied
0110  * @src_char - 2 byte host-endian source character
0111  * @cp - codepage to which character should be converted
0112  * @map_type - How should the 7 NTFS/SMB reserved characters be mapped to UCS2?
0113  *
0114  * This function handles the conversion of a single character. It is the
0115  * responsibility of the caller to ensure that the target buffer is large
0116  * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
0117  */
0118 static int
0119 cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp,
0120          int maptype)
0121 {
0122     int len = 1;
0123     __u16 src_char;
0124
0125     src_char = *from;
0126
0127     if ((maptype == SFM_MAP_UNI_RSVD) && convert_sfm_char(src_char, target))
0128         return len;
0129     else if ((maptype == SFU_MAP_UNI_RSVD) &&
0130           convert_sfu_char(src_char, target))
0131         return len;
0132
0133     /* if character not one of seven in special remap set */
0134     len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
0135     if (len <= 0)
0136         goto surrogate_pair;
0137
0138     return len;
0139
0140 surrogate_pair:
0141     /* convert SURROGATE_PAIR and IVS */
0142     if (strcmp(cp->charset, "utf8"))
0143         goto unknown;
0144     len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6);
0145     if (len <= 0)
0146         goto unknown;
0147     return len;
0148
0149 unknown:
0150     *target = '?';
0151     len = 1;
0152     return len;
0153 }
0154
0155 /*
0156  * cifs_from_utf16 - convert utf16le string to local charset
0157  * @to - destination buffer
0158  * @from - source buffer
0159  * @tolen - destination buffer size (in bytes)
0160  * @fromlen - source buffer size (in bytes)
0161  * @codepage - codepage to which characters should be converted
0162  * @mapchar - should characters be remapped according to the mapchars option?
0163  *
0164  * Convert a little-endian utf16le string (as sent by the server) to a string
0165  * in the provided codepage. The tolen and fromlen parameters are to ensure
0166  * that the code doesn't walk off of the end of the buffer (which is always
0167  * a danger if the alignment of the source buffer is off). The destination
0168  * string is always properly null terminated and fits in the destination
0169  * buffer. Returns the length of the destination string in bytes (including
0170  * null terminator).
0171  *
0172  * Note that some windows versions actually send multiword UTF-16 characters
0173  * instead of straight UTF16-2. The linux nls routines however aren't able to
0174  * deal with those characters properly. In the event that we get some of
0175  * those characters, they won't be translated properly.
0176  */
0177 int
0178 cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
0179         const struct nls_table *codepage, int map_type)
0180 {
0181     int i, charlen, safelen;
0182     int outlen = 0;
0183     int nullsize = nls_nullsize(codepage);
0184     int fromwords = fromlen / 2;
0185     char tmp[NLS_MAX_CHARSET_SIZE];
0186     __u16 ftmp[3];      /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */
0187
0188     /*
0189      * because the chars can be of varying widths, we need to take care
0190      * not to overflow the destination buffer when we get close to the
0191      * end of it. Until we get to this offset, we don't need to check
0192      * for overflow however.
0193      */
0194     safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
0195
0196     for (i = 0; i < fromwords; i++) {
0197         ftmp[0] = get_unaligned_le16(&from[i]);
0198         if (ftmp[0] == 0)
0199             break;
0200         if (i + 1 < fromwords)
0201             ftmp[1] = get_unaligned_le16(&from[i + 1]);
0202         else
0203             ftmp[1] = 0;
0204         if (i + 2 < fromwords)
0205             ftmp[2] = get_unaligned_le16(&from[i + 2]);
0206         else
0207             ftmp[2] = 0;
0208
0209         /*
0210          * check to see if converting this character might make the
0211          * conversion bleed into the null terminator
0212          */
0213         if (outlen >= safelen) {
0214             charlen = cifs_mapchar(tmp, ftmp, codepage, map_type);
0215             if ((outlen + charlen) > (tolen - nullsize))
0216                 break;
0217         }
0218
0219         /* put converted char into 'to' buffer */
0220         charlen = cifs_mapchar(&to[outlen], ftmp, codepage, map_type);
0221         outlen += charlen;
0222
0223         /* charlen (=bytes of UTF-8 for 1 character)
0224          * 4bytes UTF-8(surrogate pair) is charlen=4
0225          *   (4bytes UTF-16 code)
0226          * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
0227          *   (2 UTF-8 pairs divided to 2 UTF-16 pairs) */
0228         if (charlen == 4)
0229             i++;
0230         else if (charlen >= 5)
0231             /* 5-6bytes UTF-8 */
0232             i += 2;
0233     }
0234
0235     /* properly null-terminate string */
0236     for (i = 0; i < nullsize; i++)
0237         to[outlen++] = 0;
0238
0239     return outlen;
0240 }
0241
0242 /*
0243  * NAME:    cifs_strtoUTF16()
0244  *
0245  * FUNCTION:    Convert character string to unicode string
0246  *
0247  */
0248 int
0249 cifs_strtoUTF16(__le16 *to, const char *from, int len,
0250           const struct nls_table *codepage)
0251 {
0252     int charlen;
0253     int i;
0254     wchar_t wchar_to; /* needed to quiet sparse */
0255
0256     /* special case for utf8 to handle no plane0 chars */
0257     if (!strcmp(codepage->charset, "utf8")) {
0258         /*
0259          * convert utf8 -> utf16, we assume we have enough space
0260          * as caller should have assumed conversion does not overflow
0261          * in destination len is length in wchar_t units (16bits)
0262          */
0263         i  = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN,
0264                        (wchar_t *) to, len);
0265
0266         /* if success terminate and exit */
0267         if (i >= 0)
0268             goto success;
0269         /*
0270          * if fails fall back to UCS encoding as this
0271          * function should not return negative values
0272          * currently can fail only if source contains
0273          * invalid encoded characters
0274          */
0275     }
0276
0277     for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
0278         charlen = codepage->char2uni(from, len, &wchar_to);
0279         if (charlen < 1) {
0280             cifs_dbg(VFS, "strtoUTF16: char2uni of 0x%x returned %d\n",
0281                  *from, charlen);
0282             /* A question mark */
0283             wchar_to = 0x003f;
0284             charlen = 1;
0285         }
0286         put_unaligned_le16(wchar_to, &to[i]);
0287     }
0288
0289 success:
0290     put_unaligned_le16(0, &to[i]);
0291     return i;
0292 }
0293
0294 /*
0295  * cifs_utf16_bytes - how long will a string be after conversion?
0296  * @utf16 - pointer to input string
0297  * @maxbytes - don't go past this many bytes of input string
0298  * @codepage - destination codepage
0299  *
0300  * Walk a utf16le string and return the number of bytes that the string will
0301  * be after being converted to the given charset, not including any null
0302  * termination required. Don't walk past maxbytes in the source buffer.
0303  */
0304 int
0305 cifs_utf16_bytes(const __le16 *from, int maxbytes,
0306         const struct nls_table *codepage)
0307 {
0308     int i;
0309     int charlen, outlen = 0;
0310     int maxwords = maxbytes / 2;
0311     char tmp[NLS_MAX_CHARSET_SIZE];
0312     __u16 ftmp[3];
0313
0314     for (i = 0; i < maxwords; i++) {
0315         ftmp[0] = get_unaligned_le16(&from[i]);
0316         if (ftmp[0] == 0)
0317             break;
0318         if (i + 1 < maxwords)
0319             ftmp[1] = get_unaligned_le16(&from[i + 1]);
0320         else
0321             ftmp[1] = 0;
0322         if (i + 2 < maxwords)
0323             ftmp[2] = get_unaligned_le16(&from[i + 2]);
0324         else
0325             ftmp[2] = 0;
0326
0327         charlen = cifs_mapchar(tmp, ftmp, codepage, NO_MAP_UNI_RSVD);
0328         outlen += charlen;
0329     }
0330
0331     return outlen;
0332 }
0333
0334 /*
0335  * cifs_strndup_from_utf16 - copy a string from wire format to the local
0336  * codepage
0337  * @src - source string
0338  * @maxlen - don't walk past this many bytes in the source string
0339  * @is_unicode - is this a unicode string?
0340  * @codepage - destination codepage
0341  *
0342  * Take a string given by the server, convert it to the local codepage and
0343  * put it in a new buffer. Returns a pointer to the new string or NULL on
0344  * error.
0345  */
0346 char *
0347 cifs_strndup_from_utf16(const char *src, const int maxlen,
0348             const bool is_unicode, const struct nls_table *codepage)
0349 {
0350     int len;
0351     char *dst;
0352
0353     if (is_unicode) {
0354         len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage);
0355         len += nls_nullsize(codepage);
0356         dst = kmalloc(len, GFP_KERNEL);
0357         if (!dst)
0358             return NULL;
0359         cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
0360                 NO_MAP_UNI_RSVD);
0361     } else {
0362         dst = kstrndup(src, maxlen, GFP_KERNEL);
0363     }
0364
0365     return dst;
0366 }
0367
0368 static __le16 convert_to_sfu_char(char src_char)
0369 {
0370     __le16 dest_char;
0371
0372     switch (src_char) {
0373     case ':':
0374         dest_char = cpu_to_le16(UNI_COLON);
0375         break;
0376     case '*':
0377         dest_char = cpu_to_le16(UNI_ASTERISK);
0378         break;
0379     case '?':
0380         dest_char = cpu_to_le16(UNI_QUESTION);
0381         break;
0382     case '<':
0383         dest_char = cpu_to_le16(UNI_LESSTHAN);
0384         break;
0385     case '>':
0386         dest_char = cpu_to_le16(UNI_GRTRTHAN);
0387         break;
0388     case '|':
0389         dest_char = cpu_to_le16(UNI_PIPE);
0390         break;
0391     default:
0392         dest_char = 0;
0393     }
0394
0395     return dest_char;
0396 }
0397
0398 static __le16 convert_to_sfm_char(char src_char, bool end_of_string)
0399 {
0400     __le16 dest_char;
0401
0402     if (src_char >= 0x01 && src_char <= 0x1F) {
0403         dest_char = cpu_to_le16(src_char + 0xF000);
0404         return dest_char;
0405     }
0406     switch (src_char) {
0407     case ':':
0408         dest_char = cpu_to_le16(SFM_COLON);
0409         break;
0410     case '"':
0411         dest_char = cpu_to_le16(SFM_DOUBLEQUOTE);
0412         break;
0413     case '*':
0414         dest_char = cpu_to_le16(SFM_ASTERISK);
0415         break;
0416     case '?':
0417         dest_char = cpu_to_le16(SFM_QUESTION);
0418         break;
0419     case '<':
0420         dest_char = cpu_to_le16(SFM_LESSTHAN);
0421         break;
0422     case '>':
0423         dest_char = cpu_to_le16(SFM_GRTRTHAN);
0424         break;
0425     case '|':
0426         dest_char = cpu_to_le16(SFM_PIPE);
0427         break;
0428     case '.':
0429         if (end_of_string)
0430             dest_char = cpu_to_le16(SFM_PERIOD);
0431         else
0432             dest_char = 0;
0433         break;
0434     case ' ':
0435         if (end_of_string)
0436             dest_char = cpu_to_le16(SFM_SPACE);
0437         else
0438             dest_char = 0;
0439         break;
0440     default:
0441         dest_char = 0;
0442     }
0443
0444     return dest_char;
0445 }
0446
0447 /*
0448  * Convert 16 bit Unicode pathname to wire format from string in current code
0449  * page. Conversion may involve remapping up the six characters that are
0450  * only legal in POSIX-like OS (if they are present in the string). Path
0451  * names are little endian 16 bit Unicode on the wire
0452  */
0453 int
0454 cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
0455          const struct nls_table *cp, int map_chars)
0456 {
0457     int i, charlen;
0458     int j = 0;
0459     char src_char;
0460     __le16 dst_char;
0461     wchar_t tmp;
0462     wchar_t *wchar_to;  /* UTF-16 */
0463     int ret;
0464     unicode_t u;
0465
0466     if (map_chars == NO_MAP_UNI_RSVD)
0467         return cifs_strtoUTF16(target, source, PATH_MAX, cp);
0468
0469     wchar_to = kzalloc(6, GFP_KERNEL);
0470
0471     for (i = 0; i < srclen; j++) {
0472         src_char = source[i];
0473         charlen = 1;
0474
0475         /* check if end of string */
0476         if (src_char == 0)
0477             goto ctoUTF16_out;
0478
0479         /* see if we must remap this char */
0480         if (map_chars == SFU_MAP_UNI_RSVD)
0481             dst_char = convert_to_sfu_char(src_char);
0482         else if (map_chars == SFM_MAP_UNI_RSVD) {
0483             bool end_of_string;
0484
0485             /**
0486              * Remap spaces and periods found at the end of every
0487              * component of the path. The special cases of '.' and
0488              * '..' do not need to be dealt with explicitly because
0489              * they are addressed in namei.c:link_path_walk().
0490              **/
0491             if ((i == srclen - 1) || (source[i+1] == '\\'))
0492                 end_of_string = true;
0493             else
0494                 end_of_string = false;
0495
0496             dst_char = convert_to_sfm_char(src_char, end_of_string);
0497         } else
0498             dst_char = 0;
0499         /*
0500          * FIXME: We can not handle remapping backslash (UNI_SLASH)
0501          * until all the calls to build_path_from_dentry are modified,
0502          * as they use backslash as separator.
0503          */
0504         if (dst_char == 0) {
0505             charlen = cp->char2uni(source + i, srclen - i, &tmp);
0506             dst_char = cpu_to_le16(tmp);
0507
0508             /*
0509              * if no match, use question mark, which at least in
0510              * some cases serves as wild card
0511              */
0512             if (charlen > 0)
0513                 goto ctoUTF16;
0514
0515             /* convert SURROGATE_PAIR */
0516             if (strcmp(cp->charset, "utf8") || !wchar_to)
0517                 goto unknown;
0518             if (*(source + i) & 0x80) {
0519                 charlen = utf8_to_utf32(source + i, 6, &u);
0520                 if (charlen < 0)
0521                     goto unknown;
0522             } else
0523                 goto unknown;
0524             ret  = utf8s_to_utf16s(source + i, charlen,
0525                            UTF16_LITTLE_ENDIAN,
0526                            wchar_to, 6);
0527             if (ret < 0)
0528                 goto unknown;
0529
0530             i += charlen;
0531             dst_char = cpu_to_le16(*wchar_to);
0532             if (charlen <= 3)
0533                 /* 1-3bytes UTF-8 to 2bytes UTF-16 */
0534                 put_unaligned(dst_char, &target[j]);
0535             else if (charlen == 4) {
0536                 /* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
0537                  * 7-8bytes UTF-8(IVS) divided to 2 UTF-16
0538                  *   (charlen=3+4 or 4+4) */
0539                 put_unaligned(dst_char, &target[j]);
0540                 dst_char = cpu_to_le16(*(wchar_to + 1));
0541                 j++;
0542                 put_unaligned(dst_char, &target[j]);
0543             } else if (charlen >= 5) {
0544                 /* 5-6bytes UTF-8 to 6bytes UTF-16 */
0545                 put_unaligned(dst_char, &target[j]);
0546                 dst_char = cpu_to_le16(*(wchar_to + 1));
0547                 j++;
0548                 put_unaligned(dst_char, &target[j]);
0549                 dst_char = cpu_to_le16(*(wchar_to + 2));
0550                 j++;
0551                 put_unaligned(dst_char, &target[j]);
0552             }
0553             continue;
0554
0555 unknown:
0556             dst_char = cpu_to_le16(0x003f);
0557             charlen = 1;
0558         }
0559
0560 ctoUTF16:
0561         /*
0562          * character may take more than one byte in the source string,
0563          * but will take exactly two bytes in the target string
0564          */
0565         i += charlen;
0566         put_unaligned(dst_char, &target[j]);
0567     }
0568
0569 ctoUTF16_out:
0570     put_unaligned(0, &target[j]); /* Null terminate target unicode string */
0571     kfree(wchar_to);
0572     return j;
0573 }
0574
0575 /*
0576  * cifs_local_to_utf16_bytes - how long will a string be after conversion?
0577  * @from - pointer to input string
0578  * @maxbytes - don't go past this many bytes of input string
0579  * @codepage - source codepage
0580  *
0581  * Walk a string and return the number of bytes that the string will
0582  * be after being converted to the given charset, not including any null
0583  * termination required. Don't walk past maxbytes in the source buffer.
0584  */
0585
0586 static int
0587 cifs_local_to_utf16_bytes(const char *from, int len,
0588               const struct nls_table *codepage)
0589 {
0590     int charlen;
0591     int i;
0592     wchar_t wchar_to;
0593
0594     for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
0595         charlen = codepage->char2uni(from, len, &wchar_to);
0596         /* Failed conversion defaults to a question mark */
0597         if (charlen < 1)
0598             charlen = 1;
0599     }
0600     return 2 * i; /* UTF16 characters are two bytes */
0601 }
0602
0603 /*
0604  * cifs_strndup_to_utf16 - copy a string to wire format from the local codepage
0605  * @src - source string
0606  * @maxlen - don't walk past this many bytes in the source string
0607  * @utf16_len - the length of the allocated string in bytes (including null)
0608  * @cp - source codepage
0609  * @remap - map special chars
0610  *
0611  * Take a string convert it from the local codepage to UTF16 and
0612  * put it in a new buffer. Returns a pointer to the new string or NULL on
0613  * error.
0614  */
0615 __le16 *
0616 cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len,
0617               const struct nls_table *cp, int remap)
0618 {
0619     int len;
0620     __le16 *dst;
0621
0622     len = cifs_local_to_utf16_bytes(src, maxlen, cp);
0623     len += 2; /* NULL */
0624     dst = kmalloc(len, GFP_KERNEL);
0625     if (!dst) {
0626         *utf16_len = 0;
0627         return NULL;
0628     }
0629     cifsConvertToUTF16(dst, src, strlen(src), cp, remap);
0630     *utf16_len = len;
0631     return dst;
0632 }