Back to home page

OSCL-LXR

 
 

    


0001 // SPDX-License-Identifier: GPL-2.0
0002 #include <string.h>
0003 #include "debug.h"
0004 
0005 #include "demangle-rust.h"
0006 
0007 /*
0008  * Mangled Rust symbols look like this:
0009  *
0010  *     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
0011  *
0012  * The original symbol is:
0013  *
0014  *     <std::sys::fd::FileDesc as core::ops::Drop>::drop
0015  *
0016  * The last component of the path is a 64-bit hash in lowercase hex, prefixed
0017  * with "h". Rust does not have a global namespace between crates, an illusion
0018  * which Rust maintains by using the hash to distinguish things that would
0019  * otherwise have the same symbol.
0020  *
0021  * Any path component not starting with a XID_Start character is prefixed with
0022  * "_".
0023  *
0024  * The following escape sequences are used:
0025  *
0026  *     ","  =>  $C$
0027  *     "@"  =>  $SP$
0028  *     "*"  =>  $BP$
0029  *     "&"  =>  $RF$
0030  *     "<"  =>  $LT$
0031  *     ">"  =>  $GT$
0032  *     "("  =>  $LP$
0033  *     ")"  =>  $RP$
0034  *     " "  =>  $u20$
0035  *     "'"  =>  $u27$
0036  *     "["  =>  $u5b$
0037  *     "]"  =>  $u5d$
0038  *     "~"  =>  $u7e$
0039  *
0040  * A double ".." means "::" and a single "." means "-".
0041  *
0042  * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
0043  */
0044 
/*
 * Shape of the trailing hash component: the literal "::h" followed by a
 * fixed number of hex digits.
 */
static const size_t hash_len = 16;          /* hex digits after the prefix */
static const size_t hash_prefix_len = 3;    /* length of "::h" */
static const char *hash_prefix = "::h";

/* File-local helpers, defined below. */
static bool looks_like_rust(const char *sym, size_t len);
static bool is_prefixed_hash(const char *str);
static bool unescape(const char **in, char **out, const char *seq, char value);
0052 
0053 /*
0054  * INPUT:
0055  *     sym: symbol that has been through BFD-demangling
0056  *
0057  * This function looks for the following indicators:
0058  *
0059  *  1. The hash must consist of "h" followed by 16 lowercase hex digits.
0060  *
0061  *  2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
0062  *     hex digits. This is true of 99.9998% of hashes so once in your life you
0063  *     may see a false negative. The point is to notice path components that
0064  *     could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
0065  *     this case a false positive (non-Rust symbol has an important path
0066  *     component removed because it looks like a Rust hash) is worse than a
0067  *     false negative (the rare Rust symbol is not demangled) so this sets the
0068  *     balance in favor of false negatives.
0069  *
0070  *  3. There must be no characters other than a-zA-Z0-9 and _.:$
0071  *
0072  *  4. There must be no unrecognized $-sign sequences.
0073  *
0074  *  5. There must be no sequence of three or more dots in a row ("...").
0075  */
0076 bool
0077 rust_is_mangled(const char *sym)
0078 {
0079     size_t len, len_without_hash;
0080 
0081     if (!sym)
0082         return false;
0083 
0084     len = strlen(sym);
0085     if (len <= hash_prefix_len + hash_len)
0086         /* Not long enough to contain "::h" + hash + something else */
0087         return false;
0088 
0089     len_without_hash = len - (hash_prefix_len + hash_len);
0090     if (!is_prefixed_hash(sym + len_without_hash))
0091         return false;
0092 
0093     return looks_like_rust(sym, len_without_hash);
0094 }
0095 
0096 /*
0097  * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
0098  * digits must comprise between 5 and 15 (inclusive) distinct digits.
0099  */
0100 static bool is_prefixed_hash(const char *str)
0101 {
0102     const char *end;
0103     bool seen[16];
0104     size_t i;
0105     int count;
0106 
0107     if (strncmp(str, hash_prefix, hash_prefix_len))
0108         return false;
0109     str += hash_prefix_len;
0110 
0111     memset(seen, false, sizeof(seen));
0112     for (end = str + hash_len; str < end; str++)
0113         if (*str >= '0' && *str <= '9')
0114             seen[*str - '0'] = true;
0115         else if (*str >= 'a' && *str <= 'f')
0116             seen[*str - 'a' + 10] = true;
0117         else
0118             return false;
0119 
0120     /* Count how many distinct digits seen */
0121     count = 0;
0122     for (i = 0; i < 16; i++)
0123         if (seen[i])
0124             count++;
0125 
0126     return count >= 5 && count <= 15;
0127 }
0128 
/*
 * Scan the first len bytes of str and report whether they could be the path
 * part of a mangled Rust symbol: only a-zA-Z0-9 and _.:$ may appear, every
 * '$' must start a recognized escape sequence, and no run of three or more
 * dots is allowed.
 */
static bool looks_like_rust(const char *str, size_t len)
{
    static const char *const escapes[] = {
        "$C$",
        "$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
        "$u20$", "$u27$", "$u5b$", "$u5d$", "$u7e$",
    };
    const size_t n_escapes = sizeof(escapes) / sizeof(escapes[0]);
    const char *const limit = str + len;

    while (str < limit) {
        const char c = *str;

        if (c == '$') {
            size_t matched = 0;
            size_t i;

            /* No escape is a prefix of another, so the first hit wins */
            for (i = 0; i < n_escapes && matched == 0; i++) {
                const size_t esc_len = strlen(escapes[i]);

                if (strncmp(str, escapes[i], esc_len) == 0)
                    matched = esc_len;
            }
            if (matched == 0)
                return false;
            str += matched;
        } else if (c == '.') {
            /* Do not allow three or more consecutive dots */
            if (strncmp(str, "...", 3) == 0)
                return false;
            str++;
        } else if ((c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9')
                || c == '_'
                || c == ':') {
            str++;
        } else {
            return false;
        }
    }

    return true;
}
0173 
0174 /*
0175  * INPUT:
0176  *     sym: symbol for which rust_is_mangled(sym) returns true
0177  *
0178  * The input is demangled in-place because the mangled name is always longer
0179  * than the demangled one.
0180  */
0181 void
0182 rust_demangle_sym(char *sym)
0183 {
0184     const char *in;
0185     char *out;
0186     const char *end;
0187 
0188     if (!sym)
0189         return;
0190 
0191     in = sym;
0192     out = sym;
0193     end = sym + strlen(sym) - (hash_prefix_len + hash_len);
0194 
0195     while (in < end)
0196         switch (*in) {
0197         case '$':
0198             if (!(unescape(&in, &out, "$C$", ',')
0199                     || unescape(&in, &out, "$SP$", '@')
0200                     || unescape(&in, &out, "$BP$", '*')
0201                     || unescape(&in, &out, "$RF$", '&')
0202                     || unescape(&in, &out, "$LT$", '<')
0203                     || unescape(&in, &out, "$GT$", '>')
0204                     || unescape(&in, &out, "$LP$", '(')
0205                     || unescape(&in, &out, "$RP$", ')')
0206                     || unescape(&in, &out, "$u20$", ' ')
0207                     || unescape(&in, &out, "$u27$", '\'')
0208                     || unescape(&in, &out, "$u5b$", '[')
0209                     || unescape(&in, &out, "$u5d$", ']')
0210                     || unescape(&in, &out, "$u7e$", '~'))) {
0211                 pr_err("demangle-rust: unexpected escape sequence");
0212                 goto done;
0213             }
0214             break;
0215         case '_':
0216             /*
0217              * If this is the start of a path component and the next
0218              * character is an escape sequence, ignore the
0219              * underscore. The mangler inserts an underscore to make
0220              * sure the path component begins with a XID_Start
0221              * character.
0222              */
0223             if ((in == sym || in[-1] == ':') && in[1] == '$')
0224                 in++;
0225             else
0226                 *out++ = *in++;
0227             break;
0228         case '.':
0229             if (in[1] == '.') {
0230                 /* ".." becomes "::" */
0231                 *out++ = ':';
0232                 *out++ = ':';
0233                 in += 2;
0234             } else {
0235                 /* "." becomes "-" */
0236                 *out++ = '-';
0237                 in++;
0238             }
0239             break;
0240         case 'a' ... 'z':
0241         case 'A' ... 'Z':
0242         case '0' ... '9':
0243         case ':':
0244             *out++ = *in++;
0245             break;
0246         default:
0247             pr_err("demangle-rust: unexpected character '%c' in symbol\n",
0248                 *in);
0249             goto done;
0250         }
0251 
0252 done:
0253     *out = '\0';
0254 }
0255 
/*
 * Try to consume the escape sequence seq at the input cursor.
 *
 * If the text at *in starts with seq, store value at *out, advance *in past
 * the sequence and *out by one byte, and return true. Otherwise return false
 * and leave both cursors and the output unchanged.
 */
static bool unescape(const char **in, char **out, const char *seq, char value)
{
    const size_t seq_len = strlen(seq);
    const char *src = *in;
    char *dst = *out;

    if (strncmp(src, seq, seq_len) != 0)
        return false;

    *dst = value;

    *out = dst + 1;
    *in = src + seq_len;

    return true;
}