0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018 package org.apache.spark.unsafe.types;
0019
0020 import java.io.ByteArrayOutputStream;
0021 import java.io.IOException;
0022 import java.nio.ByteBuffer;
0023 import java.nio.ByteOrder;
0024 import java.nio.charset.StandardCharsets;
0025 import java.util.*;
0026
0027 import com.google.common.collect.ImmutableMap;
0028 import org.apache.spark.unsafe.Platform;
0029 import org.junit.Test;
0030
0031 import static org.junit.Assert.*;
0032
0033 import static org.apache.spark.unsafe.Platform.BYTE_ARRAY_OFFSET;
0034 import static org.apache.spark.unsafe.types.UTF8String.*;
0035
0036 public class UTF8StringSuite {
0037
0038 private static void checkBasic(String str, int len) {
0039 UTF8String s1 = fromString(str);
0040 UTF8String s2 = fromBytes(str.getBytes(StandardCharsets.UTF_8));
0041 assertEquals(len, s1.numChars());
0042 assertEquals(len, s2.numChars());
0043
0044 assertEquals(str, s1.toString());
0045 assertEquals(str, s2.toString());
0046 assertEquals(s1, s2);
0047
0048 assertEquals(s1.hashCode(), s2.hashCode());
0049
0050 assertEquals(0, s1.compareTo(s2));
0051
0052 assertTrue(s1.contains(s2));
0053 assertTrue(s2.contains(s1));
0054 assertTrue(s1.startsWith(s2));
0055 assertTrue(s1.endsWith(s2));
0056 }
0057
0058 @Test
0059 public void basicTest() {
0060 checkBasic("", 0);
0061 checkBasic("¡", 1);
0062 checkBasic("ку", 2);
0063 checkBasic("hello", 5);
0064 checkBasic("大 千 世 界", 7);
0065 checkBasic("︽﹋%", 3);
0066 checkBasic("\uD83E\uDD19", 1);
0067 }
0068
0069 @Test
0070 public void emptyStringTest() {
0071 assertEquals(EMPTY_UTF8, fromString(""));
0072 assertEquals(EMPTY_UTF8, fromBytes(new byte[0]));
0073 assertEquals(0, EMPTY_UTF8.numChars());
0074 assertEquals(0, EMPTY_UTF8.numBytes());
0075 }
0076
0077 @Test
0078 public void prefix() {
0079 assertTrue(fromString("a").getPrefix() - fromString("b").getPrefix() < 0);
0080 assertTrue(fromString("ab").getPrefix() - fromString("b").getPrefix() < 0);
0081 assertTrue(
0082 fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
0083 assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
0084 assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0);
0085
0086 byte[] buf1 = {1, 2, 3, 4, 5, 6, 7, 8, 9};
0087 byte[] buf2 = {1, 2, 3};
0088 UTF8String str1 = fromBytes(buf1, 0, 3);
0089 UTF8String str2 = fromBytes(buf1, 0, 8);
0090 UTF8String str3 = fromBytes(buf2);
0091 assertTrue(str1.getPrefix() - str2.getPrefix() < 0);
0092 assertEquals(str1.getPrefix(), str3.getPrefix());
0093 }
0094
0095 @Test
0096 public void compareTo() {
0097 assertTrue(fromString("").compareTo(fromString("a")) < 0);
0098 assertTrue(fromString("abc").compareTo(fromString("ABC")) > 0);
0099 assertTrue(fromString("abc0").compareTo(fromString("abc")) > 0);
0100 assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabc")) == 0);
0101 assertTrue(fromString("aBcabcabc").compareTo(fromString("Abcabcabc")) > 0);
0102 assertTrue(fromString("Abcabcabc").compareTo(fromString("abcabcabC")) < 0);
0103 assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabC")) > 0);
0104
0105 assertTrue(fromString("abc").compareTo(fromString("世界")) < 0);
0106 assertTrue(fromString("你好").compareTo(fromString("世界")) > 0);
0107 assertTrue(fromString("你好123").compareTo(fromString("你好122")) > 0);
0108 }
0109
0110 protected static void testUpperandLower(String upper, String lower) {
0111 UTF8String us = fromString(upper);
0112 UTF8String ls = fromString(lower);
0113 assertEquals(ls, us.toLowerCase());
0114 assertEquals(us, ls.toUpperCase());
0115 assertEquals(us, us.toUpperCase());
0116 assertEquals(ls, ls.toLowerCase());
0117 }
0118
0119 @Test
0120 public void upperAndLower() {
0121 testUpperandLower("", "");
0122 testUpperandLower("0123456", "0123456");
0123 testUpperandLower("ABCXYZ", "abcxyz");
0124 testUpperandLower("ЀЁЂѺΏỀ", "ѐёђѻώề");
0125 testUpperandLower("大千世界 数据砖头", "大千世界 数据砖头");
0126 }
0127
0128 @Test
0129 public void titleCase() {
0130 assertEquals(fromString(""), fromString("").toTitleCase());
0131 assertEquals(fromString("Ab Bc Cd"), fromString("ab bc cd").toTitleCase());
0132 assertEquals(fromString("Ѐ Ё Ђ Ѻ Ώ Ề"), fromString("ѐ ё ђ ѻ ώ ề").toTitleCase());
0133 assertEquals(fromString("大千世界 数据砖头"), fromString("大千世界 数据砖头").toTitleCase());
0134 }
0135
0136 @Test
0137 public void concatTest() {
0138 assertEquals(EMPTY_UTF8, concat());
0139 assertNull(concat((UTF8String) null));
0140 assertEquals(EMPTY_UTF8, concat(EMPTY_UTF8));
0141 assertEquals(fromString("ab"), concat(fromString("ab")));
0142 assertEquals(fromString("ab"), concat(fromString("a"), fromString("b")));
0143 assertEquals(fromString("abc"), concat(fromString("a"), fromString("b"), fromString("c")));
0144 assertNull(concat(fromString("a"), null, fromString("c")));
0145 assertNull(concat(fromString("a"), null, null));
0146 assertNull(concat(null, null, null));
0147 assertEquals(fromString("数据砖头"), concat(fromString("数据"), fromString("砖头")));
0148 }
0149
0150 @Test
0151 public void concatWsTest() {
0152
0153 assertNull(concatWs(null, (UTF8String) null));
0154 assertNull(concatWs(null, fromString("a")));
0155
0156
0157 UTF8String sep = fromString("哈哈");
0158 assertEquals(
0159 EMPTY_UTF8,
0160 concatWs(sep, EMPTY_UTF8));
0161 assertEquals(
0162 fromString("ab"),
0163 concatWs(sep, fromString("ab")));
0164 assertEquals(
0165 fromString("a哈哈b"),
0166 concatWs(sep, fromString("a"), fromString("b")));
0167 assertEquals(
0168 fromString("a哈哈b哈哈c"),
0169 concatWs(sep, fromString("a"), fromString("b"), fromString("c")));
0170 assertEquals(
0171 fromString("a哈哈c"),
0172 concatWs(sep, fromString("a"), null, fromString("c")));
0173 assertEquals(
0174 fromString("a"),
0175 concatWs(sep, fromString("a"), null, null));
0176 assertEquals(
0177 EMPTY_UTF8,
0178 concatWs(sep, null, null, null));
0179 assertEquals(
0180 fromString("数据哈哈砖头"),
0181 concatWs(sep, fromString("数据"), fromString("砖头")));
0182 }
0183
0184 @Test
0185 public void contains() {
0186 assertTrue(EMPTY_UTF8.contains(EMPTY_UTF8));
0187 assertTrue(fromString("hello").contains(fromString("ello")));
0188 assertFalse(fromString("hello").contains(fromString("vello")));
0189 assertFalse(fromString("hello").contains(fromString("hellooo")));
0190 assertTrue(fromString("大千世界").contains(fromString("千世界")));
0191 assertFalse(fromString("大千世界").contains(fromString("世千")));
0192 assertFalse(fromString("大千世界").contains(fromString("大千世界好")));
0193 }
0194
0195 @Test
0196 public void startsWith() {
0197 assertTrue(EMPTY_UTF8.startsWith(EMPTY_UTF8));
0198 assertTrue(fromString("hello").startsWith(fromString("hell")));
0199 assertFalse(fromString("hello").startsWith(fromString("ell")));
0200 assertFalse(fromString("hello").startsWith(fromString("hellooo")));
0201 assertTrue(fromString("数据砖头").startsWith(fromString("数据")));
0202 assertFalse(fromString("大千世界").startsWith(fromString("千")));
0203 assertFalse(fromString("大千世界").startsWith(fromString("大千世界好")));
0204 }
0205
0206 @Test
0207 public void endsWith() {
0208 assertTrue(EMPTY_UTF8.endsWith(EMPTY_UTF8));
0209 assertTrue(fromString("hello").endsWith(fromString("ello")));
0210 assertFalse(fromString("hello").endsWith(fromString("ellov")));
0211 assertFalse(fromString("hello").endsWith(fromString("hhhello")));
0212 assertTrue(fromString("大千世界").endsWith(fromString("世界")));
0213 assertFalse(fromString("大千世界").endsWith(fromString("世")));
0214 assertFalse(fromString("数据砖头").endsWith(fromString("我的数据砖头")));
0215 }
0216
0217 @Test
0218 public void substring() {
0219 assertEquals(EMPTY_UTF8, fromString("hello").substring(0, 0));
0220 assertEquals(fromString("el"), fromString("hello").substring(1, 3));
0221 assertEquals(fromString("数"), fromString("数据砖头").substring(0, 1));
0222 assertEquals(fromString("据砖"), fromString("数据砖头").substring(1, 3));
0223 assertEquals(fromString("头"), fromString("数据砖头").substring(3, 5));
0224 assertEquals(fromString("ߵ梷"), fromString("ߵ梷").substring(0, 2));
0225 }
0226
0227 @Test
0228 public void trims() {
0229 assertEquals(fromString("1"), fromString("1").trim());
0230 assertEquals(fromString("1"), fromString("1\t").trimAll());
0231
0232 assertEquals(fromString("hello"), fromString(" hello ").trim());
0233 assertEquals(fromString("hello "), fromString(" hello ").trimLeft());
0234 assertEquals(fromString(" hello"), fromString(" hello ").trimRight());
0235
0236 assertEquals(EMPTY_UTF8, EMPTY_UTF8.trim());
0237 assertEquals(EMPTY_UTF8, fromString(" ").trim());
0238 assertEquals(EMPTY_UTF8, fromString(" ").trimLeft());
0239 assertEquals(EMPTY_UTF8, fromString(" ").trimRight());
0240
0241 assertEquals(fromString("数据砖头"), fromString(" 数据砖头 ").trim());
0242 assertEquals(fromString("数据砖头 "), fromString(" 数据砖头 ").trimLeft());
0243 assertEquals(fromString(" 数据砖头"), fromString(" 数据砖头 ").trimRight());
0244
0245 assertEquals(fromString("数据砖头"), fromString("数据砖头").trim());
0246 assertEquals(fromString("数据砖头"), fromString("数据砖头").trimLeft());
0247 assertEquals(fromString("数据砖头"), fromString("数据砖头").trimRight());
0248
0249 char[] charsLessThan0x20 = new char[10];
0250 Arrays.fill(charsLessThan0x20, (char)(' ' - 1));
0251 String stringStartingWithSpace =
0252 new String(charsLessThan0x20) + "hello" + new String(charsLessThan0x20);
0253 assertEquals(fromString(stringStartingWithSpace), fromString(stringStartingWithSpace).trim());
0254 assertEquals(fromString(stringStartingWithSpace),
0255 fromString(stringStartingWithSpace).trimLeft());
0256 assertEquals(fromString(stringStartingWithSpace),
0257 fromString(stringStartingWithSpace).trimRight());
0258 }
0259
0260 @Test
0261 public void indexOf() {
0262 assertEquals(0, EMPTY_UTF8.indexOf(EMPTY_UTF8, 0));
0263 assertEquals(-1, EMPTY_UTF8.indexOf(fromString("l"), 0));
0264 assertEquals(0, fromString("hello").indexOf(EMPTY_UTF8, 0));
0265 assertEquals(2, fromString("hello").indexOf(fromString("l"), 0));
0266 assertEquals(3, fromString("hello").indexOf(fromString("l"), 3));
0267 assertEquals(-1, fromString("hello").indexOf(fromString("a"), 0));
0268 assertEquals(2, fromString("hello").indexOf(fromString("ll"), 0));
0269 assertEquals(-1, fromString("hello").indexOf(fromString("ll"), 4));
0270 assertEquals(1, fromString("数据砖头").indexOf(fromString("据砖"), 0));
0271 assertEquals(-1, fromString("数据砖头").indexOf(fromString("数"), 3));
0272 assertEquals(0, fromString("数据砖头").indexOf(fromString("数"), 0));
0273 assertEquals(3, fromString("数据砖头").indexOf(fromString("头"), 0));
0274 }
0275
0276 @Test
0277 public void substring_index() {
0278 assertEquals(fromString("www.apache.org"),
0279 fromString("www.apache.org").subStringIndex(fromString("."), 3));
0280 assertEquals(fromString("www.apache"),
0281 fromString("www.apache.org").subStringIndex(fromString("."), 2));
0282 assertEquals(fromString("www"),
0283 fromString("www.apache.org").subStringIndex(fromString("."), 1));
0284 assertEquals(fromString(""),
0285 fromString("www.apache.org").subStringIndex(fromString("."), 0));
0286 assertEquals(fromString("org"),
0287 fromString("www.apache.org").subStringIndex(fromString("."), -1));
0288 assertEquals(fromString("apache.org"),
0289 fromString("www.apache.org").subStringIndex(fromString("."), -2));
0290 assertEquals(fromString("www.apache.org"),
0291 fromString("www.apache.org").subStringIndex(fromString("."), -3));
0292
0293 assertEquals(fromString(""),
0294 fromString("").subStringIndex(fromString("."), 1));
0295
0296 assertEquals(fromString(""),
0297 fromString("www.apache.org").subStringIndex(fromString(""), 1));
0298
0299 assertEquals(fromString("www.apache.org"),
0300 fromString("www.apache.org").subStringIndex(fromString("#"), 2));
0301
0302 assertEquals(fromString("www||apache"),
0303 fromString("www||apache||org").subStringIndex(fromString("||"), 2));
0304 assertEquals(fromString("apache||org"),
0305 fromString("www||apache||org").subStringIndex(fromString("||"), -2));
0306
0307 assertEquals(fromString("大千世界大"),
0308 fromString("大千世界大千世界").subStringIndex(fromString("千"), 2));
0309
0310 assertEquals(fromString("||"), fromString("||||||").subStringIndex(fromString("|||"), 3));
0311 assertEquals(fromString("|||"), fromString("||||||").subStringIndex(fromString("|||"), -4));
0312 }
0313
0314 @Test
0315 public void reverse() {
0316 assertEquals(fromString("olleh"), fromString("hello").reverse());
0317 assertEquals(EMPTY_UTF8, EMPTY_UTF8.reverse());
0318 assertEquals(fromString("者行孙"), fromString("孙行者").reverse());
0319 assertEquals(fromString("者行孙 olleh"), fromString("hello 孙行者").reverse());
0320 }
0321
0322 @Test
0323 public void repeat() {
0324 assertEquals(fromString("数d数d数d数d数d"), fromString("数d").repeat(5));
0325 assertEquals(fromString("数d"), fromString("数d").repeat(1));
0326 assertEquals(EMPTY_UTF8, fromString("数d").repeat(-1));
0327 }
0328
0329 @Test
0330 public void pad() {
0331 assertEquals(fromString("hel"), fromString("hello").lpad(3, fromString("????")));
0332 assertEquals(fromString("hello"), fromString("hello").lpad(5, fromString("????")));
0333 assertEquals(fromString("?hello"), fromString("hello").lpad(6, fromString("????")));
0334 assertEquals(fromString("???????hello"), fromString("hello").lpad(12, fromString("????")));
0335 assertEquals(fromString("?????hello"), fromString("hello").lpad(10, fromString("?????")));
0336 assertEquals(fromString("???????"), EMPTY_UTF8.lpad(7, fromString("?????")));
0337
0338 assertEquals(fromString("hel"), fromString("hello").rpad(3, fromString("????")));
0339 assertEquals(fromString("hello"), fromString("hello").rpad(5, fromString("????")));
0340 assertEquals(fromString("hello?"), fromString("hello").rpad(6, fromString("????")));
0341 assertEquals(fromString("hello???????"), fromString("hello").rpad(12, fromString("????")));
0342 assertEquals(fromString("hello?????"), fromString("hello").rpad(10, fromString("?????")));
0343 assertEquals(fromString("???????"), EMPTY_UTF8.rpad(7, fromString("?????")));
0344
0345 assertEquals(fromString("数据砖"), fromString("数据砖头").lpad(3, fromString("????")));
0346 assertEquals(fromString("?数据砖头"), fromString("数据砖头").lpad(5, fromString("????")));
0347 assertEquals(fromString("??数据砖头"), fromString("数据砖头").lpad(6, fromString("????")));
0348 assertEquals(fromString("孙行数据砖头"), fromString("数据砖头").lpad(6, fromString("孙行者")));
0349 assertEquals(fromString("孙行者数据砖头"), fromString("数据砖头").lpad(7, fromString("孙行者")));
0350 assertEquals(
0351 fromString("孙行者孙行者孙行数据砖头"),
0352 fromString("数据砖头").lpad(12, fromString("孙行者")));
0353
0354 assertEquals(fromString("数据砖"), fromString("数据砖头").rpad(3, fromString("????")));
0355 assertEquals(fromString("数据砖头?"), fromString("数据砖头").rpad(5, fromString("????")));
0356 assertEquals(fromString("数据砖头??"), fromString("数据砖头").rpad(6, fromString("????")));
0357 assertEquals(fromString("数据砖头孙行"), fromString("数据砖头").rpad(6, fromString("孙行者")));
0358 assertEquals(fromString("数据砖头孙行者"), fromString("数据砖头").rpad(7, fromString("孙行者")));
0359 assertEquals(
0360 fromString("数据砖头孙行者孙行者孙行"),
0361 fromString("数据砖头").rpad(12, fromString("孙行者")));
0362
0363 assertEquals(EMPTY_UTF8, fromString("数据砖头").lpad(-10, fromString("孙行者")));
0364 assertEquals(EMPTY_UTF8, fromString("数据砖头").lpad(-10, EMPTY_UTF8));
0365 assertEquals(fromString("数据砖头"), fromString("数据砖头").lpad(5, EMPTY_UTF8));
0366 assertEquals(fromString("数据砖"), fromString("数据砖头").lpad(3, EMPTY_UTF8));
0367 assertEquals(EMPTY_UTF8, EMPTY_UTF8.lpad(3, EMPTY_UTF8));
0368
0369 assertEquals(EMPTY_UTF8, fromString("数据砖头").rpad(-10, fromString("孙行者")));
0370 assertEquals(EMPTY_UTF8, fromString("数据砖头").rpad(-10, EMPTY_UTF8));
0371 assertEquals(fromString("数据砖头"), fromString("数据砖头").rpad(5, EMPTY_UTF8));
0372 assertEquals(fromString("数据砖"), fromString("数据砖头").rpad(3, EMPTY_UTF8));
0373 assertEquals(EMPTY_UTF8, EMPTY_UTF8.rpad(3, EMPTY_UTF8));
0374 }
0375
0376 @Test
0377 public void substringSQL() {
0378 UTF8String e = fromString("example");
0379 assertEquals(fromString("ex"), e.substringSQL(0, 2));
0380 assertEquals(fromString("ex"), e.substringSQL(1, 2));
0381 assertEquals(fromString("example"), e.substringSQL(0, 7));
0382 assertEquals(fromString("ex"), e.substringSQL(1, 2));
0383 assertEquals(fromString("example"), e.substringSQL(0, 100));
0384 assertEquals(fromString("example"), e.substringSQL(1, 100));
0385 assertEquals(fromString("xa"), e.substringSQL(2, 2));
0386 assertEquals(fromString("exampl"), e.substringSQL(1, 6));
0387 assertEquals(fromString("xample"), e.substringSQL(2, 100));
0388 assertEquals(fromString(""), e.substringSQL(0, 0));
0389 assertEquals(EMPTY_UTF8, e.substringSQL(100, 4));
0390 assertEquals(fromString("example"), e.substringSQL(0, Integer.MAX_VALUE));
0391 assertEquals(fromString("example"), e.substringSQL(1, Integer.MAX_VALUE));
0392 assertEquals(fromString("xample"), e.substringSQL(2, Integer.MAX_VALUE));
0393 }
0394
0395 @Test
0396 public void split() {
0397 UTF8String[] negativeAndZeroLimitCase =
0398 new UTF8String[]{fromString("ab"), fromString("def"), fromString("ghi"), fromString("")};
0399 assertTrue(Arrays.equals(fromString("ab,def,ghi,").split(fromString(","), 0),
0400 negativeAndZeroLimitCase));
0401 assertTrue(Arrays.equals(fromString("ab,def,ghi,").split(fromString(","), -1),
0402 negativeAndZeroLimitCase));
0403 assertTrue(Arrays.equals(fromString("ab,def,ghi,").split(fromString(","), 2),
0404 new UTF8String[]{fromString("ab"), fromString("def,ghi,")}));
0405 }
0406
0407 @Test
0408 public void replace() {
0409 assertEquals(
0410 fromString("re123ace"),
0411 fromString("replace").replace(fromString("pl"), fromString("123")));
0412 assertEquals(
0413 fromString("reace"),
0414 fromString("replace").replace(fromString("pl"), fromString("")));
0415 assertEquals(
0416 fromString("replace"),
0417 fromString("replace").replace(fromString(""), fromString("123")));
0418
0419 assertEquals(
0420 fromString("a12ca12c"),
0421 fromString("abcabc").replace(fromString("b"), fromString("12")));
0422 assertEquals(
0423 fromString("adad"),
0424 fromString("abcdabcd").replace(fromString("bc"), fromString("")));
0425
0426 assertEquals(
0427 fromString("AbcAbc"),
0428 fromString("abcabc").replace(fromString("a"), fromString("A")));
0429 assertEquals(
0430 fromString("abcabc"),
0431 fromString("abcabc").replace(fromString("Z"), fromString("A")));
0432
0433 assertEquals(
0434 fromString("花ab界"),
0435 fromString("花花世界").replace(fromString("花世"), fromString("ab")));
0436 assertEquals(
0437 fromString("a水c"),
0438 fromString("a火c").replace(fromString("火"), fromString("水")));
0439
0440 assertEquals(
0441 fromString("abcd").repeat(17),
0442 fromString("a").repeat(17).replace(fromString("a"), fromString("abcd")));
0443 }
0444
0445 @Test
0446 public void levenshteinDistance() {
0447 assertEquals(0, EMPTY_UTF8.levenshteinDistance(EMPTY_UTF8));
0448 assertEquals(1, EMPTY_UTF8.levenshteinDistance(fromString("a")));
0449 assertEquals(7, fromString("aaapppp").levenshteinDistance(EMPTY_UTF8));
0450 assertEquals(1, fromString("frog").levenshteinDistance(fromString("fog")));
0451 assertEquals(3, fromString("fly").levenshteinDistance(fromString("ant")));
0452 assertEquals(7, fromString("elephant").levenshteinDistance(fromString("hippo")));
0453 assertEquals(7, fromString("hippo").levenshteinDistance(fromString("elephant")));
0454 assertEquals(8, fromString("hippo").levenshteinDistance(fromString("zzzzzzzz")));
0455 assertEquals(1, fromString("hello").levenshteinDistance(fromString("hallo")));
0456 assertEquals(4, fromString("世界千世").levenshteinDistance(fromString("千a世b")));
0457 }
0458
0459 @Test
0460 public void translate() {
0461 assertEquals(
0462 fromString("1a2s3ae"),
0463 fromString("translate").translate(ImmutableMap.of(
0464 'r', '1',
0465 'n', '2',
0466 'l', '3',
0467 't', '\0'
0468 )));
0469 assertEquals(
0470 fromString("translate"),
0471 fromString("translate").translate(new HashMap<>()));
0472 assertEquals(
0473 fromString("asae"),
0474 fromString("translate").translate(ImmutableMap.of(
0475 'r', '\0',
0476 'n', '\0',
0477 'l', '\0',
0478 't', '\0'
0479 )));
0480 assertEquals(
0481 fromString("aa世b"),
0482 fromString("花花世界").translate(ImmutableMap.of(
0483 '花', 'a',
0484 '界', 'b'
0485 )));
0486 }
0487
0488 @Test
0489 public void createBlankString() {
0490 assertEquals(fromString(" "), blankString(1));
0491 assertEquals(fromString(" "), blankString(2));
0492 assertEquals(fromString(" "), blankString(3));
0493 assertEquals(fromString(""), blankString(0));
0494 }
0495
0496 @Test
0497 public void findInSet() {
0498 assertEquals(1, fromString("ab").findInSet(fromString("ab")));
0499 assertEquals(2, fromString("a,b").findInSet(fromString("b")));
0500 assertEquals(3, fromString("abc,b,ab,c,def").findInSet(fromString("ab")));
0501 assertEquals(1, fromString("ab,abc,b,ab,c,def").findInSet(fromString("ab")));
0502 assertEquals(4, fromString(",,,ab,abc,b,ab,c,def").findInSet(fromString("ab")));
0503 assertEquals(1, fromString(",ab,abc,b,ab,c,def").findInSet(fromString("")));
0504 assertEquals(4, fromString("数据砖头,abc,b,ab,c,def").findInSet(fromString("ab")));
0505 assertEquals(6, fromString("数据砖头,abc,b,ab,c,def").findInSet(fromString("def")));
0506 }
0507
0508 @Test
0509 public void soundex() {
0510 assertEquals(fromString("R163"), fromString("Robert").soundex());
0511 assertEquals(fromString("R163"), fromString("Rupert").soundex());
0512 assertEquals(fromString("R150"), fromString("Rubin").soundex());
0513 assertEquals(fromString("A261"), fromString("Ashcraft").soundex());
0514 assertEquals(fromString("A261"), fromString("Ashcroft").soundex());
0515 assertEquals(fromString("B620"), fromString("Burroughs").soundex());
0516 assertEquals(fromString("B620"), fromString("Burrows").soundex());
0517 assertEquals(fromString("E251"), fromString("Ekzampul").soundex());
0518 assertEquals(fromString("E251"), fromString("Example").soundex());
0519 assertEquals(fromString("E460"), fromString("Ellery").soundex());
0520 assertEquals(fromString("E460"), fromString("Euler").soundex());
0521 assertEquals(fromString("G200"), fromString("Ghosh").soundex());
0522 assertEquals(fromString("G200"), fromString("Gauss").soundex());
0523 assertEquals(fromString("G362"), fromString("Gutierrez").soundex());
0524 assertEquals(fromString("H416"), fromString("Heilbronn").soundex());
0525 assertEquals(fromString("H416"), fromString("Hilbert").soundex());
0526 assertEquals(fromString("J250"), fromString("Jackson").soundex());
0527 assertEquals(fromString("K530"), fromString("Kant").soundex());
0528 assertEquals(fromString("K530"), fromString("Knuth").soundex());
0529 assertEquals(fromString("L000"), fromString("Lee").soundex());
0530 assertEquals(fromString("L222"), fromString("Lukasiewicz").soundex());
0531 assertEquals(fromString("L222"), fromString("Lissajous").soundex());
0532 assertEquals(fromString("L300"), fromString("Ladd").soundex());
0533 assertEquals(fromString("L300"), fromString("Lloyd").soundex());
0534 assertEquals(fromString("M220"), fromString("Moses").soundex());
0535 assertEquals(fromString("O600"), fromString("O'Hara").soundex());
0536 assertEquals(fromString("P236"), fromString("Pfister").soundex());
0537 assertEquals(fromString("R150"), fromString("Rubin").soundex());
0538 assertEquals(fromString("R163"), fromString("Robert").soundex());
0539 assertEquals(fromString("R163"), fromString("Rupert").soundex());
0540 assertEquals(fromString("S532"), fromString("Soundex").soundex());
0541 assertEquals(fromString("S532"), fromString("Sownteks").soundex());
0542 assertEquals(fromString("T522"), fromString("Tymczak").soundex());
0543 assertEquals(fromString("V532"), fromString("VanDeusen").soundex());
0544 assertEquals(fromString("W252"), fromString("Washington").soundex());
0545 assertEquals(fromString("W350"), fromString("Wheaton").soundex());
0546
0547 assertEquals(fromString("A000"), fromString("a").soundex());
0548 assertEquals(fromString("A100"), fromString("ab").soundex());
0549 assertEquals(fromString("A120"), fromString("abc").soundex());
0550 assertEquals(fromString("A123"), fromString("abcd").soundex());
0551 assertEquals(fromString(""), fromString("").soundex());
0552 assertEquals(fromString("123"), fromString("123").soundex());
0553 assertEquals(fromString("世界千世"), fromString("世界千世").soundex());
0554 }
0555
0556 @Test
0557 public void writeToOutputStreamUnderflow() throws IOException {
0558
0559 final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
0560 final byte[] test = "01234567".getBytes(StandardCharsets.UTF_8);
0561
0562 for (int i = 1; i <= Platform.BYTE_ARRAY_OFFSET; ++i) {
0563 UTF8String.fromAddress(test, Platform.BYTE_ARRAY_OFFSET - i, test.length + i)
0564 .writeTo(outputStream);
0565 final ByteBuffer buffer = ByteBuffer.wrap(outputStream.toByteArray(), i, test.length);
0566 assertEquals("01234567", StandardCharsets.UTF_8.decode(buffer).toString());
0567 outputStream.reset();
0568 }
0569 }
0570
0571 @Test
0572 public void writeToOutputStreamSlice() throws IOException {
0573 final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
0574 final byte[] test = "01234567".getBytes(StandardCharsets.UTF_8);
0575
0576 for (int i = 0; i < test.length; ++i) {
0577 for (int j = 0; j < test.length - i; ++j) {
0578 UTF8String.fromAddress(test, Platform.BYTE_ARRAY_OFFSET + i, j)
0579 .writeTo(outputStream);
0580
0581 assertArrayEquals(Arrays.copyOfRange(test, i, i + j), outputStream.toByteArray());
0582 outputStream.reset();
0583 }
0584 }
0585 }
0586
0587 @Test
0588 public void writeToOutputStreamOverflow() throws IOException {
0589 final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
0590 final byte[] test = "01234567".getBytes(StandardCharsets.UTF_8);
0591
0592 final HashSet<Long> offsets = new HashSet<>();
0593 for (int i = 0; i < 16; ++i) {
0594
0595 offsets.add((long) Integer.MAX_VALUE - i);
0596
0597
0598 offsets.add(Long.MAX_VALUE - BYTE_ARRAY_OFFSET - i);
0599 }
0600
0601 for (long i = 1; i > 0L; i <<= 1) {
0602 for (long j = 0; j < 32L; ++j) {
0603 offsets.add(i + j);
0604 }
0605 }
0606
0607 for (final long offset : offsets) {
0608 try {
0609 fromAddress(test, BYTE_ARRAY_OFFSET + offset, test.length)
0610 .writeTo(outputStream);
0611
0612 throw new IllegalStateException(Long.toString(offset));
0613 } catch (ArrayIndexOutOfBoundsException e) {
0614
0615 } finally {
0616 outputStream.reset();
0617 }
0618 }
0619 }
0620
0621 @Test
0622 public void writeToOutputStream() throws IOException {
0623 final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
0624 EMPTY_UTF8.writeTo(outputStream);
0625 assertEquals("", outputStream.toString(StandardCharsets.UTF_8.name()));
0626 outputStream.reset();
0627
0628 fromString("数据砖很重").writeTo(outputStream);
0629 assertEquals(
0630 "数据砖很重",
0631 outputStream.toString(StandardCharsets.UTF_8.name()));
0632 outputStream.reset();
0633 }
0634
0635 @Test
0636 public void writeToOutputStreamIntArray() throws IOException {
0637
0638 final ByteBuffer buffer = StandardCharsets.UTF_8.encode("大千世界");
0639 buffer.position(0);
0640 buffer.order(ByteOrder.nativeOrder());
0641
0642 final int length = buffer.limit();
0643 assertEquals(12, length);
0644
0645 final int ints = length / 4;
0646 final int[] array = new int[ints];
0647
0648 for (int i = 0; i < ints; ++i) {
0649 array[i] = buffer.getInt();
0650 }
0651
0652 final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
0653 fromAddress(array, Platform.INT_ARRAY_OFFSET, length)
0654 .writeTo(outputStream);
0655 assertEquals("大千世界", outputStream.toString(StandardCharsets.UTF_8.name()));
0656 }
0657
0658 @Test
0659 public void testToShort() throws IOException {
0660 Map<String, Short> inputToExpectedOutput = new HashMap<>();
0661 inputToExpectedOutput.put("1", (short) 1);
0662 inputToExpectedOutput.put("+1", (short) 1);
0663 inputToExpectedOutput.put("-1", (short) -1);
0664 inputToExpectedOutput.put("0", (short) 0);
0665 inputToExpectedOutput.put("1111.12345678901234567890", (short) 1111);
0666 inputToExpectedOutput.put(String.valueOf(Short.MAX_VALUE), Short.MAX_VALUE);
0667 inputToExpectedOutput.put(String.valueOf(Short.MIN_VALUE), Short.MIN_VALUE);
0668
0669 Random rand = new Random();
0670 for (int i = 0; i < 10; i++) {
0671 short value = (short) rand.nextInt();
0672 inputToExpectedOutput.put(String.valueOf(value), value);
0673 }
0674
0675 IntWrapper wrapper = new IntWrapper();
0676 for (Map.Entry<String, Short> entry : inputToExpectedOutput.entrySet()) {
0677 assertTrue(entry.getKey(), UTF8String.fromString(entry.getKey()).toShort(wrapper));
0678 assertEquals((short) entry.getValue(), wrapper.value);
0679 }
0680
0681 List<String> negativeInputs =
0682 Arrays.asList("", " ", "null", "NULL", "\n", "~1212121", "3276700");
0683
0684 for (String negativeInput : negativeInputs) {
0685 assertFalse(negativeInput, UTF8String.fromString(negativeInput).toShort(wrapper));
0686 }
0687 }
0688
0689 @Test
0690 public void testToByte() throws IOException {
0691 Map<String, Byte> inputToExpectedOutput = new HashMap<>();
0692 inputToExpectedOutput.put("1", (byte) 1);
0693 inputToExpectedOutput.put("+1",(byte) 1);
0694 inputToExpectedOutput.put("-1", (byte) -1);
0695 inputToExpectedOutput.put("0", (byte) 0);
0696 inputToExpectedOutput.put("111.12345678901234567890", (byte) 111);
0697 inputToExpectedOutput.put(String.valueOf(Byte.MAX_VALUE), Byte.MAX_VALUE);
0698 inputToExpectedOutput.put(String.valueOf(Byte.MIN_VALUE), Byte.MIN_VALUE);
0699
0700 Random rand = new Random();
0701 for (int i = 0; i < 10; i++) {
0702 byte value = (byte) rand.nextInt();
0703 inputToExpectedOutput.put(String.valueOf(value), value);
0704 }
0705
0706 IntWrapper intWrapper = new IntWrapper();
0707 for (Map.Entry<String, Byte> entry : inputToExpectedOutput.entrySet()) {
0708 assertTrue(entry.getKey(), UTF8String.fromString(entry.getKey()).toByte(intWrapper));
0709 assertEquals((byte) entry.getValue(), intWrapper.value);
0710 }
0711
0712 List<String> negativeInputs =
0713 Arrays.asList("", " ", "null", "NULL", "\n", "~1212121", "12345678901234567890");
0714
0715 for (String negativeInput : negativeInputs) {
0716 assertFalse(negativeInput, UTF8String.fromString(negativeInput).toByte(intWrapper));
0717 }
0718 }
0719
0720 @Test
0721 public void testToInt() throws IOException {
0722 Map<String, Integer> inputToExpectedOutput = new HashMap<>();
0723 inputToExpectedOutput.put("1", 1);
0724 inputToExpectedOutput.put("+1", 1);
0725 inputToExpectedOutput.put("-1", -1);
0726 inputToExpectedOutput.put("0", 0);
0727 inputToExpectedOutput.put("11111.1234567", 11111);
0728 inputToExpectedOutput.put(String.valueOf(Integer.MAX_VALUE), Integer.MAX_VALUE);
0729 inputToExpectedOutput.put(String.valueOf(Integer.MIN_VALUE), Integer.MIN_VALUE);
0730
0731 Random rand = new Random();
0732 for (int i = 0; i < 10; i++) {
0733 int value = rand.nextInt();
0734 inputToExpectedOutput.put(String.valueOf(value), value);
0735 }
0736
0737 IntWrapper intWrapper = new IntWrapper();
0738 for (Map.Entry<String, Integer> entry : inputToExpectedOutput.entrySet()) {
0739 assertTrue(entry.getKey(), UTF8String.fromString(entry.getKey()).toInt(intWrapper));
0740 assertEquals((int) entry.getValue(), intWrapper.value);
0741 }
0742
0743 List<String> negativeInputs =
0744 Arrays.asList("", " ", "null", "NULL", "\n", "~1212121", "12345678901234567890");
0745
0746 for (String negativeInput : negativeInputs) {
0747 assertFalse(negativeInput, UTF8String.fromString(negativeInput).toInt(intWrapper));
0748 }
0749 }
0750
0751 @Test
0752 public void testToLong() throws IOException {
0753 Map<String, Long> inputToExpectedOutput = new HashMap<>();
0754 inputToExpectedOutput.put("1", 1L);
0755 inputToExpectedOutput.put("+1", 1L);
0756 inputToExpectedOutput.put("-1", -1L);
0757 inputToExpectedOutput.put("0", 0L);
0758 inputToExpectedOutput.put("1076753423.12345678901234567890", 1076753423L);
0759 inputToExpectedOutput.put(String.valueOf(Long.MAX_VALUE), Long.MAX_VALUE);
0760 inputToExpectedOutput.put(String.valueOf(Long.MIN_VALUE), Long.MIN_VALUE);
0761
0762 Random rand = new Random();
0763 for (int i = 0; i < 10; i++) {
0764 long value = rand.nextLong();
0765 inputToExpectedOutput.put(String.valueOf(value), value);
0766 }
0767
0768 LongWrapper wrapper = new LongWrapper();
0769 for (Map.Entry<String, Long> entry : inputToExpectedOutput.entrySet()) {
0770 assertTrue(entry.getKey(), UTF8String.fromString(entry.getKey()).toLong(wrapper));
0771 assertEquals((long) entry.getValue(), wrapper.value);
0772 }
0773
0774 List<String> negativeInputs = Arrays.asList("", " ", "null", "NULL", "\n", "~1212121",
0775 "1234567890123456789012345678901234");
0776
0777 for (String negativeInput : negativeInputs) {
0778 assertFalse(negativeInput, UTF8String.fromString(negativeInput).toLong(wrapper));
0779 }
0780 }
0781
0782 @Test
0783 public void trimBothWithTrimString() {
0784 assertEquals(fromString("hello"), fromString(" hello ").trim(fromString(" ")));
0785 assertEquals(fromString("o"), fromString(" hello ").trim(fromString(" hle")));
0786 assertEquals(fromString("h e"), fromString("ooh e ooo").trim(fromString("o ")));
0787 assertEquals(fromString(""), fromString("ooo...oooo").trim(fromString("o.")));
0788 assertEquals(fromString("b"), fromString("%^b[]@").trim(fromString("][@^%")));
0789
0790 assertEquals(EMPTY_UTF8, fromString(" ").trim(fromString(" ")));
0791
0792 assertEquals(fromString("数据砖头"), fromString(" 数据砖头 ").trim());
0793 assertEquals(fromString("数"), fromString("a数b").trim(fromString("ab")));
0794 assertEquals(fromString(""), fromString("a").trim(fromString("a数b")));
0795 assertEquals(fromString(""), fromString("数数 数数数").trim(fromString("数 ")));
0796 assertEquals(fromString("据砖头"), fromString("数]数[数据砖头#数数").trim(fromString("[数]#")));
0797 assertEquals(fromString("据砖头数数 "), fromString("数数数据砖头数数 ").trim(fromString("数")));
0798 }
0799
0800 @Test
0801 public void trimLeftWithTrimString() {
0802 assertEquals(fromString(" hello "), fromString(" hello ").trimLeft(fromString("")));
0803 assertEquals(fromString(""), fromString("a").trimLeft(fromString("a")));
0804 assertEquals(fromString("b"), fromString("b").trimLeft(fromString("a")));
0805 assertEquals(fromString("ba"), fromString("ba").trimLeft(fromString("a")));
0806 assertEquals(fromString(""), fromString("aaaaaaa").trimLeft(fromString("a")));
0807 assertEquals(fromString("trim"), fromString("oabtrim").trimLeft(fromString("bao")));
0808 assertEquals(fromString("rim "), fromString("ooootrim ").trimLeft(fromString("otm")));
0809
0810 assertEquals(EMPTY_UTF8, fromString(" ").trimLeft(fromString(" ")));
0811
0812 assertEquals(fromString("数据砖头 "), fromString(" 数据砖头 ").trimLeft(fromString(" ")));
0813 assertEquals(fromString("数"), fromString("数").trimLeft(fromString("a")));
0814 assertEquals(fromString("a"), fromString("a").trimLeft(fromString("数")));
0815 assertEquals(fromString("砖头数数"), fromString("数数数据砖头数数").trimLeft(fromString("据数")));
0816 assertEquals(fromString("据砖头数数"), fromString(" 数数数据砖头数数").trimLeft(fromString("数 ")));
0817 assertEquals(fromString("据砖头数数"), fromString("aa数数数据砖头数数").trimLeft(fromString("a数砖")));
0818 assertEquals(fromString("$S,.$BR"), fromString(",,,,%$S,.$BR").trimLeft(fromString("%,")));
0819 }
0820
0821 @Test
0822 public void trimRightWithTrimString() {
0823 assertEquals(fromString(" hello "), fromString(" hello ").trimRight(fromString("")));
0824 assertEquals(fromString(""), fromString("a").trimRight(fromString("a")));
0825 assertEquals(fromString("cc"), fromString("ccbaaaa").trimRight(fromString("ba")));
0826 assertEquals(fromString(""), fromString("aabbbbaaa").trimRight(fromString("ab")));
0827 assertEquals(fromString(" he"), fromString(" hello ").trimRight(fromString(" ol")));
0828 assertEquals(fromString("oohell"),
0829 fromString("oohellooo../*&").trimRight(fromString("./,&%*o")));
0830
0831 assertEquals(EMPTY_UTF8, fromString(" ").trimRight(fromString(" ")));
0832
0833 assertEquals(fromString(" 数据砖头"), fromString(" 数据砖头 ").trimRight(fromString(" ")));
0834 assertEquals(fromString("数数砖头"), fromString("数数砖头数aa数").trimRight(fromString("a数")));
0835 assertEquals(fromString(""), fromString("数数数据砖ab").trimRight(fromString("数据砖ab")));
0836 assertEquals(fromString("头"), fromString("头a???/").trimRight(fromString("数?/*&^%a")));
0837 assertEquals(fromString("头"), fromString("头数b数数 [").trimRight(fromString(" []数b")));
0838 }
0839
0840 @Test
0841 public void skipWrongFirstByte() {
0842 int[] wrongFirstBytes = {
0843 0x80, 0x9F, 0xBF,
0844 0xC0, 0xC2,
0845
0846 0xF5, 0xF6, 0xF7, 0xF8, 0xF9,
0847 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
0848 };
0849 byte[] c = new byte[1];
0850
0851 for (int i = 0; i < wrongFirstBytes.length; ++i) {
0852 c[0] = (byte)wrongFirstBytes[i];
0853 assertEquals(1, fromBytes(c).numChars());
0854 }
0855 }
0856 }