Back to home page

OSCL-LXR

 
 

    


0001 /*
0002 * Licensed to the Apache Software Foundation (ASF) under one or more
0003 * contributor license agreements.  See the NOTICE file distributed with
0004 * this work for additional information regarding copyright ownership.
0005 * The ASF licenses this file to You under the Apache License, Version 2.0
0006 * (the "License"); you may not use this file except in compliance with
0007 * the License.  You may obtain a copy of the License at
0008 *
0009 *    http://www.apache.org/licenses/LICENSE-2.0
0010 *
0011 * Unless required by applicable law or agreed to in writing, software
0012 * distributed under the License is distributed on an "AS IS" BASIS,
0013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014 * See the License for the specific language governing permissions and
0015 * limitations under the License.
0016 */
0017 
0018 package org.apache.spark.unsafe.types;
0019 
0020 import java.io.ByteArrayOutputStream;
0021 import java.io.IOException;
0022 import java.nio.ByteBuffer;
0023 import java.nio.ByteOrder;
0024 import java.nio.charset.StandardCharsets;
0025 import java.util.*;
0026 
0027 import com.google.common.collect.ImmutableMap;
0028 import org.apache.spark.unsafe.Platform;
0029 import org.junit.Test;
0030 
0031 import static org.junit.Assert.*;
0032 
0033 import static org.apache.spark.unsafe.Platform.BYTE_ARRAY_OFFSET;
0034 import static org.apache.spark.unsafe.types.UTF8String.*;
0035 
0036 public class UTF8StringSuite {
0037 
0038   private static void checkBasic(String str, int len) {
0039     UTF8String s1 = fromString(str);
0040     UTF8String s2 = fromBytes(str.getBytes(StandardCharsets.UTF_8));
0041     assertEquals(len, s1.numChars());
0042     assertEquals(len, s2.numChars());
0043 
0044     assertEquals(str, s1.toString());
0045     assertEquals(str, s2.toString());
0046     assertEquals(s1, s2);
0047 
0048     assertEquals(s1.hashCode(), s2.hashCode());
0049 
0050     assertEquals(0, s1.compareTo(s2));
0051 
0052     assertTrue(s1.contains(s2));
0053     assertTrue(s2.contains(s1));
0054     assertTrue(s1.startsWith(s2));
0055     assertTrue(s1.endsWith(s2));
0056   }
0057 
0058   @Test
0059   public void basicTest() {
0060     checkBasic("", 0);
0061     checkBasic("¡", 1); // 2 bytes char
0062     checkBasic("ку", 2); // 2 * 2 bytes chars
0063     checkBasic("hello", 5); // 5 * 1 byte chars
0064     checkBasic("大 千 世 界", 7);
0065     checkBasic("︽﹋%", 3); // 3 * 3 bytes chars
0066     checkBasic("\uD83E\uDD19", 1); // 4 bytes char
0067   }
0068 
0069   @Test
0070   public void emptyStringTest() {
0071     assertEquals(EMPTY_UTF8, fromString(""));
0072     assertEquals(EMPTY_UTF8, fromBytes(new byte[0]));
0073     assertEquals(0, EMPTY_UTF8.numChars());
0074     assertEquals(0, EMPTY_UTF8.numBytes());
0075   }
0076 
0077   @Test
0078   public void prefix() {
0079     assertTrue(fromString("a").getPrefix() - fromString("b").getPrefix() < 0);
0080     assertTrue(fromString("ab").getPrefix() - fromString("b").getPrefix() < 0);
0081     assertTrue(
0082       fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
0083     assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
0084     assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0);
0085 
0086     byte[] buf1 = {1, 2, 3, 4, 5, 6, 7, 8, 9};
0087     byte[] buf2 = {1, 2, 3};
0088     UTF8String str1 = fromBytes(buf1, 0, 3);
0089     UTF8String str2 = fromBytes(buf1, 0, 8);
0090     UTF8String str3 = fromBytes(buf2);
0091     assertTrue(str1.getPrefix() - str2.getPrefix() < 0);
0092     assertEquals(str1.getPrefix(), str3.getPrefix());
0093   }
0094 
0095   @Test
0096   public void compareTo() {
0097     assertTrue(fromString("").compareTo(fromString("a")) < 0);
0098     assertTrue(fromString("abc").compareTo(fromString("ABC")) > 0);
0099     assertTrue(fromString("abc0").compareTo(fromString("abc")) > 0);
0100     assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabc")) == 0);
0101     assertTrue(fromString("aBcabcabc").compareTo(fromString("Abcabcabc")) > 0);
0102     assertTrue(fromString("Abcabcabc").compareTo(fromString("abcabcabC")) < 0);
0103     assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabC")) > 0);
0104 
0105     assertTrue(fromString("abc").compareTo(fromString("世界")) < 0);
0106     assertTrue(fromString("你好").compareTo(fromString("世界")) > 0);
0107     assertTrue(fromString("你好123").compareTo(fromString("你好122")) > 0);
0108   }
0109 
0110   protected static void testUpperandLower(String upper, String lower) {
0111     UTF8String us = fromString(upper);
0112     UTF8String ls = fromString(lower);
0113     assertEquals(ls, us.toLowerCase());
0114     assertEquals(us, ls.toUpperCase());
0115     assertEquals(us, us.toUpperCase());
0116     assertEquals(ls, ls.toLowerCase());
0117   }
0118 
0119   @Test
0120   public void upperAndLower() {
0121     testUpperandLower("", "");
0122     testUpperandLower("0123456", "0123456");
0123     testUpperandLower("ABCXYZ", "abcxyz");
0124     testUpperandLower("ЀЁЂѺΏỀ", "ѐёђѻώề");
0125     testUpperandLower("大千世界 数据砖头", "大千世界 数据砖头");
0126   }
0127 
0128   @Test
0129   public void titleCase() {
0130     assertEquals(fromString(""), fromString("").toTitleCase());
0131     assertEquals(fromString("Ab Bc Cd"), fromString("ab bc cd").toTitleCase());
0132     assertEquals(fromString("Ѐ Ё Ђ Ѻ Ώ Ề"), fromString("ѐ ё ђ ѻ ώ ề").toTitleCase());
0133     assertEquals(fromString("大千世界 数据砖头"), fromString("大千世界 数据砖头").toTitleCase());
0134   }
0135 
0136   @Test
0137   public void concatTest() {
0138     assertEquals(EMPTY_UTF8, concat());
0139     assertNull(concat((UTF8String) null));
0140     assertEquals(EMPTY_UTF8, concat(EMPTY_UTF8));
0141     assertEquals(fromString("ab"), concat(fromString("ab")));
0142     assertEquals(fromString("ab"), concat(fromString("a"), fromString("b")));
0143     assertEquals(fromString("abc"), concat(fromString("a"), fromString("b"), fromString("c")));
0144     assertNull(concat(fromString("a"), null, fromString("c")));
0145     assertNull(concat(fromString("a"), null, null));
0146     assertNull(concat(null, null, null));
0147     assertEquals(fromString("数据砖头"), concat(fromString("数据"), fromString("砖头")));
0148   }
0149 
0150   @Test
0151   public void concatWsTest() {
0152     // Returns null if the separator is null
0153     assertNull(concatWs(null, (UTF8String) null));
0154     assertNull(concatWs(null, fromString("a")));
0155 
0156     // If separator is null, concatWs should skip all null inputs and never return null.
0157     UTF8String sep = fromString("哈哈");
0158     assertEquals(
0159       EMPTY_UTF8,
0160       concatWs(sep, EMPTY_UTF8));
0161     assertEquals(
0162       fromString("ab"),
0163       concatWs(sep, fromString("ab")));
0164     assertEquals(
0165       fromString("a哈哈b"),
0166       concatWs(sep, fromString("a"), fromString("b")));
0167     assertEquals(
0168       fromString("a哈哈b哈哈c"),
0169       concatWs(sep, fromString("a"), fromString("b"), fromString("c")));
0170     assertEquals(
0171       fromString("a哈哈c"),
0172       concatWs(sep, fromString("a"), null, fromString("c")));
0173     assertEquals(
0174       fromString("a"),
0175       concatWs(sep, fromString("a"), null, null));
0176     assertEquals(
0177       EMPTY_UTF8,
0178       concatWs(sep, null, null, null));
0179     assertEquals(
0180       fromString("数据哈哈砖头"),
0181       concatWs(sep, fromString("数据"), fromString("砖头")));
0182   }
0183 
0184   @Test
0185   public void contains() {
0186     assertTrue(EMPTY_UTF8.contains(EMPTY_UTF8));
0187     assertTrue(fromString("hello").contains(fromString("ello")));
0188     assertFalse(fromString("hello").contains(fromString("vello")));
0189     assertFalse(fromString("hello").contains(fromString("hellooo")));
0190     assertTrue(fromString("大千世界").contains(fromString("千世界")));
0191     assertFalse(fromString("大千世界").contains(fromString("世千")));
0192     assertFalse(fromString("大千世界").contains(fromString("大千世界好")));
0193   }
0194 
0195   @Test
0196   public void startsWith() {
0197     assertTrue(EMPTY_UTF8.startsWith(EMPTY_UTF8));
0198     assertTrue(fromString("hello").startsWith(fromString("hell")));
0199     assertFalse(fromString("hello").startsWith(fromString("ell")));
0200     assertFalse(fromString("hello").startsWith(fromString("hellooo")));
0201     assertTrue(fromString("数据砖头").startsWith(fromString("数据")));
0202     assertFalse(fromString("大千世界").startsWith(fromString("千")));
0203     assertFalse(fromString("大千世界").startsWith(fromString("大千世界好")));
0204   }
0205 
0206   @Test
0207   public void endsWith() {
0208     assertTrue(EMPTY_UTF8.endsWith(EMPTY_UTF8));
0209     assertTrue(fromString("hello").endsWith(fromString("ello")));
0210     assertFalse(fromString("hello").endsWith(fromString("ellov")));
0211     assertFalse(fromString("hello").endsWith(fromString("hhhello")));
0212     assertTrue(fromString("大千世界").endsWith(fromString("世界")));
0213     assertFalse(fromString("大千世界").endsWith(fromString("世")));
0214     assertFalse(fromString("数据砖头").endsWith(fromString("我的数据砖头")));
0215   }
0216 
0217   @Test
0218   public void substring() {
0219     assertEquals(EMPTY_UTF8, fromString("hello").substring(0, 0));
0220     assertEquals(fromString("el"), fromString("hello").substring(1, 3));
0221     assertEquals(fromString("数"), fromString("数据砖头").substring(0, 1));
0222     assertEquals(fromString("据砖"), fromString("数据砖头").substring(1, 3));
0223     assertEquals(fromString("头"), fromString("数据砖头").substring(3, 5));
0224     assertEquals(fromString("ߵ梷"), fromString("ߵ梷").substring(0, 2));
0225   }
0226 
0227   @Test
0228   public void trims() {
0229     assertEquals(fromString("1"), fromString("1").trim());
0230     assertEquals(fromString("1"), fromString("1\t").trimAll());
0231 
0232     assertEquals(fromString("hello"), fromString("  hello ").trim());
0233     assertEquals(fromString("hello "), fromString("  hello ").trimLeft());
0234     assertEquals(fromString("  hello"), fromString("  hello ").trimRight());
0235 
0236     assertEquals(EMPTY_UTF8, EMPTY_UTF8.trim());
0237     assertEquals(EMPTY_UTF8, fromString("  ").trim());
0238     assertEquals(EMPTY_UTF8, fromString("  ").trimLeft());
0239     assertEquals(EMPTY_UTF8, fromString("  ").trimRight());
0240 
0241     assertEquals(fromString("数据砖头"), fromString("  数据砖头 ").trim());
0242     assertEquals(fromString("数据砖头 "), fromString("  数据砖头 ").trimLeft());
0243     assertEquals(fromString("  数据砖头"), fromString("  数据砖头 ").trimRight());
0244 
0245     assertEquals(fromString("数据砖头"), fromString("数据砖头").trim());
0246     assertEquals(fromString("数据砖头"), fromString("数据砖头").trimLeft());
0247     assertEquals(fromString("数据砖头"), fromString("数据砖头").trimRight());
0248 
0249     char[] charsLessThan0x20 = new char[10];
0250     Arrays.fill(charsLessThan0x20, (char)(' ' - 1));
0251     String stringStartingWithSpace =
0252       new String(charsLessThan0x20) + "hello" + new String(charsLessThan0x20);
0253     assertEquals(fromString(stringStartingWithSpace), fromString(stringStartingWithSpace).trim());
0254     assertEquals(fromString(stringStartingWithSpace),
0255       fromString(stringStartingWithSpace).trimLeft());
0256     assertEquals(fromString(stringStartingWithSpace),
0257       fromString(stringStartingWithSpace).trimRight());
0258   }
0259 
0260   @Test
0261   public void indexOf() {
0262     assertEquals(0, EMPTY_UTF8.indexOf(EMPTY_UTF8, 0));
0263     assertEquals(-1, EMPTY_UTF8.indexOf(fromString("l"), 0));
0264     assertEquals(0, fromString("hello").indexOf(EMPTY_UTF8, 0));
0265     assertEquals(2, fromString("hello").indexOf(fromString("l"), 0));
0266     assertEquals(3, fromString("hello").indexOf(fromString("l"), 3));
0267     assertEquals(-1, fromString("hello").indexOf(fromString("a"), 0));
0268     assertEquals(2, fromString("hello").indexOf(fromString("ll"), 0));
0269     assertEquals(-1, fromString("hello").indexOf(fromString("ll"), 4));
0270     assertEquals(1, fromString("数据砖头").indexOf(fromString("据砖"), 0));
0271     assertEquals(-1, fromString("数据砖头").indexOf(fromString("数"), 3));
0272     assertEquals(0, fromString("数据砖头").indexOf(fromString("数"), 0));
0273     assertEquals(3, fromString("数据砖头").indexOf(fromString("头"), 0));
0274   }
0275 
0276   @Test
0277   public void substring_index() {
0278     assertEquals(fromString("www.apache.org"),
0279       fromString("www.apache.org").subStringIndex(fromString("."), 3));
0280     assertEquals(fromString("www.apache"),
0281       fromString("www.apache.org").subStringIndex(fromString("."), 2));
0282     assertEquals(fromString("www"),
0283       fromString("www.apache.org").subStringIndex(fromString("."), 1));
0284     assertEquals(fromString(""),
0285       fromString("www.apache.org").subStringIndex(fromString("."), 0));
0286     assertEquals(fromString("org"),
0287       fromString("www.apache.org").subStringIndex(fromString("."), -1));
0288     assertEquals(fromString("apache.org"),
0289       fromString("www.apache.org").subStringIndex(fromString("."), -2));
0290     assertEquals(fromString("www.apache.org"),
0291       fromString("www.apache.org").subStringIndex(fromString("."), -3));
0292     // str is empty string
0293     assertEquals(fromString(""),
0294       fromString("").subStringIndex(fromString("."), 1));
0295     // empty string delim
0296     assertEquals(fromString(""),
0297       fromString("www.apache.org").subStringIndex(fromString(""), 1));
0298     // delim does not exist in str
0299     assertEquals(fromString("www.apache.org"),
0300       fromString("www.apache.org").subStringIndex(fromString("#"), 2));
0301     // delim is 2 chars
0302     assertEquals(fromString("www||apache"),
0303       fromString("www||apache||org").subStringIndex(fromString("||"), 2));
0304     assertEquals(fromString("apache||org"),
0305       fromString("www||apache||org").subStringIndex(fromString("||"), -2));
0306     // non ascii chars
0307     assertEquals(fromString("大千世界大"),
0308       fromString("大千世界大千世界").subStringIndex(fromString("千"), 2));
0309     // overlapped delim
0310     assertEquals(fromString("||"), fromString("||||||").subStringIndex(fromString("|||"), 3));
0311     assertEquals(fromString("|||"), fromString("||||||").subStringIndex(fromString("|||"), -4));
0312   }
0313 
0314   @Test
0315   public void reverse() {
0316     assertEquals(fromString("olleh"), fromString("hello").reverse());
0317     assertEquals(EMPTY_UTF8, EMPTY_UTF8.reverse());
0318     assertEquals(fromString("者行孙"), fromString("孙行者").reverse());
0319     assertEquals(fromString("者行孙 olleh"), fromString("hello 孙行者").reverse());
0320   }
0321 
0322   @Test
0323   public void repeat() {
0324     assertEquals(fromString("数d数d数d数d数d"), fromString("数d").repeat(5));
0325     assertEquals(fromString("数d"), fromString("数d").repeat(1));
0326     assertEquals(EMPTY_UTF8, fromString("数d").repeat(-1));
0327   }
0328 
0329   @Test
0330   public void pad() {
0331     assertEquals(fromString("hel"), fromString("hello").lpad(3, fromString("????")));
0332     assertEquals(fromString("hello"), fromString("hello").lpad(5, fromString("????")));
0333     assertEquals(fromString("?hello"), fromString("hello").lpad(6, fromString("????")));
0334     assertEquals(fromString("???????hello"), fromString("hello").lpad(12, fromString("????")));
0335     assertEquals(fromString("?????hello"), fromString("hello").lpad(10, fromString("?????")));
0336     assertEquals(fromString("???????"), EMPTY_UTF8.lpad(7, fromString("?????")));
0337 
0338     assertEquals(fromString("hel"), fromString("hello").rpad(3, fromString("????")));
0339     assertEquals(fromString("hello"), fromString("hello").rpad(5, fromString("????")));
0340     assertEquals(fromString("hello?"), fromString("hello").rpad(6, fromString("????")));
0341     assertEquals(fromString("hello???????"), fromString("hello").rpad(12, fromString("????")));
0342     assertEquals(fromString("hello?????"), fromString("hello").rpad(10, fromString("?????")));
0343     assertEquals(fromString("???????"), EMPTY_UTF8.rpad(7, fromString("?????")));
0344 
0345     assertEquals(fromString("数据砖"), fromString("数据砖头").lpad(3, fromString("????")));
0346     assertEquals(fromString("?数据砖头"), fromString("数据砖头").lpad(5, fromString("????")));
0347     assertEquals(fromString("??数据砖头"), fromString("数据砖头").lpad(6, fromString("????")));
0348     assertEquals(fromString("孙行数据砖头"), fromString("数据砖头").lpad(6, fromString("孙行者")));
0349     assertEquals(fromString("孙行者数据砖头"), fromString("数据砖头").lpad(7, fromString("孙行者")));
0350     assertEquals(
0351       fromString("孙行者孙行者孙行数据砖头"),
0352       fromString("数据砖头").lpad(12, fromString("孙行者")));
0353 
0354     assertEquals(fromString("数据砖"), fromString("数据砖头").rpad(3, fromString("????")));
0355     assertEquals(fromString("数据砖头?"), fromString("数据砖头").rpad(5, fromString("????")));
0356     assertEquals(fromString("数据砖头??"), fromString("数据砖头").rpad(6, fromString("????")));
0357     assertEquals(fromString("数据砖头孙行"), fromString("数据砖头").rpad(6, fromString("孙行者")));
0358     assertEquals(fromString("数据砖头孙行者"), fromString("数据砖头").rpad(7, fromString("孙行者")));
0359     assertEquals(
0360       fromString("数据砖头孙行者孙行者孙行"),
0361       fromString("数据砖头").rpad(12, fromString("孙行者")));
0362 
0363     assertEquals(EMPTY_UTF8, fromString("数据砖头").lpad(-10, fromString("孙行者")));
0364     assertEquals(EMPTY_UTF8, fromString("数据砖头").lpad(-10, EMPTY_UTF8));
0365     assertEquals(fromString("数据砖头"), fromString("数据砖头").lpad(5, EMPTY_UTF8));
0366     assertEquals(fromString("数据砖"), fromString("数据砖头").lpad(3, EMPTY_UTF8));
0367     assertEquals(EMPTY_UTF8, EMPTY_UTF8.lpad(3, EMPTY_UTF8));
0368 
0369     assertEquals(EMPTY_UTF8, fromString("数据砖头").rpad(-10, fromString("孙行者")));
0370     assertEquals(EMPTY_UTF8, fromString("数据砖头").rpad(-10, EMPTY_UTF8));
0371     assertEquals(fromString("数据砖头"), fromString("数据砖头").rpad(5, EMPTY_UTF8));
0372     assertEquals(fromString("数据砖"), fromString("数据砖头").rpad(3, EMPTY_UTF8));
0373     assertEquals(EMPTY_UTF8, EMPTY_UTF8.rpad(3, EMPTY_UTF8));
0374   }
0375 
0376   @Test
0377   public void substringSQL() {
0378     UTF8String e = fromString("example");
0379     assertEquals(fromString("ex"), e.substringSQL(0, 2));
0380     assertEquals(fromString("ex"), e.substringSQL(1, 2));
0381     assertEquals(fromString("example"), e.substringSQL(0, 7));
0382     assertEquals(fromString("ex"), e.substringSQL(1, 2));
0383     assertEquals(fromString("example"), e.substringSQL(0, 100));
0384     assertEquals(fromString("example"), e.substringSQL(1, 100));
0385     assertEquals(fromString("xa"), e.substringSQL(2, 2));
0386     assertEquals(fromString("exampl"), e.substringSQL(1, 6));
0387     assertEquals(fromString("xample"), e.substringSQL(2, 100));
0388     assertEquals(fromString(""), e.substringSQL(0, 0));
0389     assertEquals(EMPTY_UTF8, e.substringSQL(100, 4));
0390     assertEquals(fromString("example"), e.substringSQL(0, Integer.MAX_VALUE));
0391     assertEquals(fromString("example"), e.substringSQL(1, Integer.MAX_VALUE));
0392     assertEquals(fromString("xample"), e.substringSQL(2, Integer.MAX_VALUE));
0393   }
0394 
0395   @Test
0396   public void split() {
0397     UTF8String[] negativeAndZeroLimitCase =
0398       new UTF8String[]{fromString("ab"), fromString("def"), fromString("ghi"), fromString("")};
0399     assertTrue(Arrays.equals(fromString("ab,def,ghi,").split(fromString(","), 0),
0400       negativeAndZeroLimitCase));
0401     assertTrue(Arrays.equals(fromString("ab,def,ghi,").split(fromString(","), -1),
0402       negativeAndZeroLimitCase));
0403     assertTrue(Arrays.equals(fromString("ab,def,ghi,").split(fromString(","), 2),
0404       new UTF8String[]{fromString("ab"), fromString("def,ghi,")}));
0405   }
0406 
0407   @Test
0408   public void replace() {
0409     assertEquals(
0410       fromString("re123ace"),
0411       fromString("replace").replace(fromString("pl"), fromString("123")));
0412     assertEquals(
0413       fromString("reace"),
0414       fromString("replace").replace(fromString("pl"), fromString("")));
0415     assertEquals(
0416       fromString("replace"),
0417       fromString("replace").replace(fromString(""), fromString("123")));
0418     // tests for multiple replacements
0419     assertEquals(
0420       fromString("a12ca12c"),
0421       fromString("abcabc").replace(fromString("b"), fromString("12")));
0422     assertEquals(
0423       fromString("adad"),
0424       fromString("abcdabcd").replace(fromString("bc"), fromString("")));
0425     // tests for single character search and replacement strings
0426     assertEquals(
0427       fromString("AbcAbc"),
0428       fromString("abcabc").replace(fromString("a"), fromString("A")));
0429     assertEquals(
0430       fromString("abcabc"),
0431       fromString("abcabc").replace(fromString("Z"), fromString("A")));
0432     // Tests with non-ASCII characters
0433     assertEquals(
0434       fromString("花ab界"),
0435       fromString("花花世界").replace(fromString("花世"), fromString("ab")));
0436     assertEquals(
0437       fromString("a水c"),
0438       fromString("a火c").replace(fromString("火"), fromString("水")));
0439     // Tests for a large number of replacements, triggering UTF8StringBuilder resize
0440     assertEquals(
0441       fromString("abcd").repeat(17),
0442       fromString("a").repeat(17).replace(fromString("a"), fromString("abcd")));
0443   }
0444 
0445   @Test
0446   public void levenshteinDistance() {
0447     assertEquals(0, EMPTY_UTF8.levenshteinDistance(EMPTY_UTF8));
0448     assertEquals(1, EMPTY_UTF8.levenshteinDistance(fromString("a")));
0449     assertEquals(7, fromString("aaapppp").levenshteinDistance(EMPTY_UTF8));
0450     assertEquals(1, fromString("frog").levenshteinDistance(fromString("fog")));
0451     assertEquals(3, fromString("fly").levenshteinDistance(fromString("ant")));
0452     assertEquals(7, fromString("elephant").levenshteinDistance(fromString("hippo")));
0453     assertEquals(7, fromString("hippo").levenshteinDistance(fromString("elephant")));
0454     assertEquals(8, fromString("hippo").levenshteinDistance(fromString("zzzzzzzz")));
0455     assertEquals(1, fromString("hello").levenshteinDistance(fromString("hallo")));
0456     assertEquals(4, fromString("世界千世").levenshteinDistance(fromString("千a世b")));
0457   }
0458 
0459   @Test
0460   public void translate() {
0461     assertEquals(
0462       fromString("1a2s3ae"),
0463       fromString("translate").translate(ImmutableMap.of(
0464         'r', '1',
0465         'n', '2',
0466         'l', '3',
0467         't', '\0'
0468       )));
0469     assertEquals(
0470       fromString("translate"),
0471       fromString("translate").translate(new HashMap<>()));
0472     assertEquals(
0473       fromString("asae"),
0474       fromString("translate").translate(ImmutableMap.of(
0475         'r', '\0',
0476         'n', '\0',
0477         'l', '\0',
0478         't', '\0'
0479       )));
0480     assertEquals(
0481       fromString("aa世b"),
0482       fromString("花花世界").translate(ImmutableMap.of(
0483         '花', 'a',
0484         '界', 'b'
0485       )));
0486   }
0487 
0488   @Test
0489   public void createBlankString() {
0490     assertEquals(fromString(" "), blankString(1));
0491     assertEquals(fromString("  "), blankString(2));
0492     assertEquals(fromString("   "), blankString(3));
0493     assertEquals(fromString(""), blankString(0));
0494   }
0495 
0496   @Test
0497   public void findInSet() {
0498     assertEquals(1, fromString("ab").findInSet(fromString("ab")));
0499     assertEquals(2, fromString("a,b").findInSet(fromString("b")));
0500     assertEquals(3, fromString("abc,b,ab,c,def").findInSet(fromString("ab")));
0501     assertEquals(1, fromString("ab,abc,b,ab,c,def").findInSet(fromString("ab")));
0502     assertEquals(4, fromString(",,,ab,abc,b,ab,c,def").findInSet(fromString("ab")));
0503     assertEquals(1, fromString(",ab,abc,b,ab,c,def").findInSet(fromString("")));
0504     assertEquals(4, fromString("数据砖头,abc,b,ab,c,def").findInSet(fromString("ab")));
0505     assertEquals(6, fromString("数据砖头,abc,b,ab,c,def").findInSet(fromString("def")));
0506   }
0507 
0508   @Test
0509   public void soundex() {
0510     assertEquals(fromString("R163"), fromString("Robert").soundex());
0511     assertEquals(fromString("R163"), fromString("Rupert").soundex());
0512     assertEquals(fromString("R150"), fromString("Rubin").soundex());
0513     assertEquals(fromString("A261"), fromString("Ashcraft").soundex());
0514     assertEquals(fromString("A261"), fromString("Ashcroft").soundex());
0515     assertEquals(fromString("B620"), fromString("Burroughs").soundex());
0516     assertEquals(fromString("B620"), fromString("Burrows").soundex());
0517     assertEquals(fromString("E251"), fromString("Ekzampul").soundex());
0518     assertEquals(fromString("E251"), fromString("Example").soundex());
0519     assertEquals(fromString("E460"), fromString("Ellery").soundex());
0520     assertEquals(fromString("E460"), fromString("Euler").soundex());
0521     assertEquals(fromString("G200"), fromString("Ghosh").soundex());
0522     assertEquals(fromString("G200"), fromString("Gauss").soundex());
0523     assertEquals(fromString("G362"), fromString("Gutierrez").soundex());
0524     assertEquals(fromString("H416"), fromString("Heilbronn").soundex());
0525     assertEquals(fromString("H416"), fromString("Hilbert").soundex());
0526     assertEquals(fromString("J250"), fromString("Jackson").soundex());
0527     assertEquals(fromString("K530"), fromString("Kant").soundex());
0528     assertEquals(fromString("K530"), fromString("Knuth").soundex());
0529     assertEquals(fromString("L000"), fromString("Lee").soundex());
0530     assertEquals(fromString("L222"), fromString("Lukasiewicz").soundex());
0531     assertEquals(fromString("L222"), fromString("Lissajous").soundex());
0532     assertEquals(fromString("L300"), fromString("Ladd").soundex());
0533     assertEquals(fromString("L300"), fromString("Lloyd").soundex());
0534     assertEquals(fromString("M220"), fromString("Moses").soundex());
0535     assertEquals(fromString("O600"), fromString("O'Hara").soundex());
0536     assertEquals(fromString("P236"), fromString("Pfister").soundex());
0537     assertEquals(fromString("R150"), fromString("Rubin").soundex());
0538     assertEquals(fromString("R163"), fromString("Robert").soundex());
0539     assertEquals(fromString("R163"), fromString("Rupert").soundex());
0540     assertEquals(fromString("S532"), fromString("Soundex").soundex());
0541     assertEquals(fromString("S532"), fromString("Sownteks").soundex());
0542     assertEquals(fromString("T522"), fromString("Tymczak").soundex());
0543     assertEquals(fromString("V532"), fromString("VanDeusen").soundex());
0544     assertEquals(fromString("W252"), fromString("Washington").soundex());
0545     assertEquals(fromString("W350"), fromString("Wheaton").soundex());
0546 
0547     assertEquals(fromString("A000"), fromString("a").soundex());
0548     assertEquals(fromString("A100"), fromString("ab").soundex());
0549     assertEquals(fromString("A120"), fromString("abc").soundex());
0550     assertEquals(fromString("A123"), fromString("abcd").soundex());
0551     assertEquals(fromString(""), fromString("").soundex());
0552     assertEquals(fromString("123"), fromString("123").soundex());
0553     assertEquals(fromString("世界千世"), fromString("世界千世").soundex());
0554   }
0555 
0556   @Test
0557   public void writeToOutputStreamUnderflow() throws IOException {
0558     // offset underflow is apparently supported?
0559     final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
0560     final byte[] test = "01234567".getBytes(StandardCharsets.UTF_8);
0561 
0562     for (int i = 1; i <= Platform.BYTE_ARRAY_OFFSET; ++i) {
0563       UTF8String.fromAddress(test, Platform.BYTE_ARRAY_OFFSET - i, test.length + i)
0564           .writeTo(outputStream);
0565       final ByteBuffer buffer = ByteBuffer.wrap(outputStream.toByteArray(), i, test.length);
0566       assertEquals("01234567", StandardCharsets.UTF_8.decode(buffer).toString());
0567       outputStream.reset();
0568     }
0569   }
0570 
0571   @Test
0572   public void writeToOutputStreamSlice() throws IOException {
0573     final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
0574     final byte[] test = "01234567".getBytes(StandardCharsets.UTF_8);
0575 
0576     for (int i = 0; i < test.length; ++i) {
0577       for (int j = 0; j < test.length - i; ++j) {
0578         UTF8String.fromAddress(test, Platform.BYTE_ARRAY_OFFSET + i, j)
0579             .writeTo(outputStream);
0580 
0581         assertArrayEquals(Arrays.copyOfRange(test, i, i + j), outputStream.toByteArray());
0582         outputStream.reset();
0583       }
0584     }
0585   }
0586 
0587   @Test
0588   public void writeToOutputStreamOverflow() throws IOException {
0589     final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
0590     final byte[] test = "01234567".getBytes(StandardCharsets.UTF_8);
0591 
0592     final HashSet<Long> offsets = new HashSet<>();
0593     for (int i = 0; i < 16; ++i) {
0594       // touch more points around MAX_VALUE
0595       offsets.add((long) Integer.MAX_VALUE - i);
0596       // subtract off BYTE_ARRAY_OFFSET to avoid wrapping around to a negative value,
0597       // which will hit the slower copy path instead of the optimized one
0598       offsets.add(Long.MAX_VALUE - BYTE_ARRAY_OFFSET - i);
0599     }
0600 
0601     for (long i = 1; i > 0L; i <<= 1) {
0602       for (long j = 0; j < 32L; ++j) {
0603         offsets.add(i + j);
0604       }
0605     }
0606 
0607     for (final long offset : offsets) {
0608       try {
0609         fromAddress(test, BYTE_ARRAY_OFFSET + offset, test.length)
0610             .writeTo(outputStream);
0611 
0612         throw new IllegalStateException(Long.toString(offset));
0613       } catch (ArrayIndexOutOfBoundsException e) {
0614         // ignore
0615       } finally {
0616         outputStream.reset();
0617       }
0618     }
0619   }
0620 
0621   @Test
0622   public void writeToOutputStream() throws IOException {
0623     final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
0624     EMPTY_UTF8.writeTo(outputStream);
0625     assertEquals("", outputStream.toString(StandardCharsets.UTF_8.name()));
0626     outputStream.reset();
0627 
0628     fromString("数据砖很重").writeTo(outputStream);
0629     assertEquals(
0630         "数据砖很重",
0631         outputStream.toString(StandardCharsets.UTF_8.name()));
0632     outputStream.reset();
0633   }
0634 
0635   @Test
0636   public void writeToOutputStreamIntArray() throws IOException {
0637     // verify that writes work on objects that are not byte arrays
0638     final ByteBuffer buffer = StandardCharsets.UTF_8.encode("大千世界");
0639     buffer.position(0);
0640     buffer.order(ByteOrder.nativeOrder());
0641 
0642     final int length = buffer.limit();
0643     assertEquals(12, length);
0644 
0645     final int ints = length / 4;
0646     final int[] array = new int[ints];
0647 
0648     for (int i = 0; i < ints; ++i) {
0649       array[i] = buffer.getInt();
0650     }
0651 
0652     final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
0653     fromAddress(array, Platform.INT_ARRAY_OFFSET, length)
0654         .writeTo(outputStream);
0655     assertEquals("大千世界", outputStream.toString(StandardCharsets.UTF_8.name()));
0656   }
0657 
0658   @Test
0659   public void testToShort() throws IOException {
0660     Map<String, Short> inputToExpectedOutput = new HashMap<>();
0661     inputToExpectedOutput.put("1", (short) 1);
0662     inputToExpectedOutput.put("+1", (short) 1);
0663     inputToExpectedOutput.put("-1", (short) -1);
0664     inputToExpectedOutput.put("0", (short) 0);
0665     inputToExpectedOutput.put("1111.12345678901234567890", (short) 1111);
0666     inputToExpectedOutput.put(String.valueOf(Short.MAX_VALUE), Short.MAX_VALUE);
0667     inputToExpectedOutput.put(String.valueOf(Short.MIN_VALUE), Short.MIN_VALUE);
0668 
0669     Random rand = new Random();
0670     for (int i = 0; i < 10; i++) {
0671       short value = (short) rand.nextInt();
0672       inputToExpectedOutput.put(String.valueOf(value), value);
0673     }
0674 
0675     IntWrapper wrapper = new IntWrapper();
0676     for (Map.Entry<String, Short> entry : inputToExpectedOutput.entrySet()) {
0677       assertTrue(entry.getKey(), UTF8String.fromString(entry.getKey()).toShort(wrapper));
0678       assertEquals((short) entry.getValue(), wrapper.value);
0679     }
0680 
0681     List<String> negativeInputs =
0682       Arrays.asList("", "  ", "null", "NULL", "\n", "~1212121", "3276700");
0683 
0684     for (String negativeInput : negativeInputs) {
0685       assertFalse(negativeInput, UTF8String.fromString(negativeInput).toShort(wrapper));
0686     }
0687   }
0688 
0689   @Test
0690   public void testToByte() throws IOException {
0691     Map<String, Byte> inputToExpectedOutput = new HashMap<>();
0692     inputToExpectedOutput.put("1", (byte) 1);
0693     inputToExpectedOutput.put("+1",(byte)  1);
0694     inputToExpectedOutput.put("-1", (byte)  -1);
0695     inputToExpectedOutput.put("0", (byte)  0);
0696     inputToExpectedOutput.put("111.12345678901234567890", (byte) 111);
0697     inputToExpectedOutput.put(String.valueOf(Byte.MAX_VALUE), Byte.MAX_VALUE);
0698     inputToExpectedOutput.put(String.valueOf(Byte.MIN_VALUE), Byte.MIN_VALUE);
0699 
0700     Random rand = new Random();
0701     for (int i = 0; i < 10; i++) {
0702       byte value = (byte) rand.nextInt();
0703       inputToExpectedOutput.put(String.valueOf(value), value);
0704     }
0705 
0706     IntWrapper intWrapper = new IntWrapper();
0707     for (Map.Entry<String, Byte> entry : inputToExpectedOutput.entrySet()) {
0708       assertTrue(entry.getKey(), UTF8String.fromString(entry.getKey()).toByte(intWrapper));
0709       assertEquals((byte) entry.getValue(), intWrapper.value);
0710     }
0711 
0712     List<String> negativeInputs =
0713       Arrays.asList("", "  ", "null", "NULL", "\n", "~1212121", "12345678901234567890");
0714 
0715     for (String negativeInput : negativeInputs) {
0716       assertFalse(negativeInput, UTF8String.fromString(negativeInput).toByte(intWrapper));
0717     }
0718   }
0719 
0720   @Test
0721   public void testToInt() throws IOException {
0722     Map<String, Integer> inputToExpectedOutput = new HashMap<>();
0723     inputToExpectedOutput.put("1", 1);
0724     inputToExpectedOutput.put("+1", 1);
0725     inputToExpectedOutput.put("-1", -1);
0726     inputToExpectedOutput.put("0", 0);
0727     inputToExpectedOutput.put("11111.1234567", 11111);
0728     inputToExpectedOutput.put(String.valueOf(Integer.MAX_VALUE), Integer.MAX_VALUE);
0729     inputToExpectedOutput.put(String.valueOf(Integer.MIN_VALUE), Integer.MIN_VALUE);
0730 
0731     Random rand = new Random();
0732     for (int i = 0; i < 10; i++) {
0733       int value = rand.nextInt();
0734       inputToExpectedOutput.put(String.valueOf(value), value);
0735     }
0736 
0737     IntWrapper intWrapper = new IntWrapper();
0738     for (Map.Entry<String, Integer> entry : inputToExpectedOutput.entrySet()) {
0739       assertTrue(entry.getKey(), UTF8String.fromString(entry.getKey()).toInt(intWrapper));
0740       assertEquals((int) entry.getValue(), intWrapper.value);
0741     }
0742 
0743     List<String> negativeInputs =
0744       Arrays.asList("", "  ", "null", "NULL", "\n", "~1212121", "12345678901234567890");
0745 
0746     for (String negativeInput : negativeInputs) {
0747       assertFalse(negativeInput, UTF8String.fromString(negativeInput).toInt(intWrapper));
0748     }
0749   }
0750 
0751   @Test
0752   public void testToLong() throws IOException {
0753     Map<String, Long> inputToExpectedOutput = new HashMap<>();
0754     inputToExpectedOutput.put("1", 1L);
0755     inputToExpectedOutput.put("+1", 1L);
0756     inputToExpectedOutput.put("-1", -1L);
0757     inputToExpectedOutput.put("0", 0L);
0758     inputToExpectedOutput.put("1076753423.12345678901234567890", 1076753423L);
0759     inputToExpectedOutput.put(String.valueOf(Long.MAX_VALUE), Long.MAX_VALUE);
0760     inputToExpectedOutput.put(String.valueOf(Long.MIN_VALUE), Long.MIN_VALUE);
0761 
0762     Random rand = new Random();
0763     for (int i = 0; i < 10; i++) {
0764       long value = rand.nextLong();
0765       inputToExpectedOutput.put(String.valueOf(value), value);
0766     }
0767 
0768     LongWrapper wrapper = new LongWrapper();
0769     for (Map.Entry<String, Long> entry : inputToExpectedOutput.entrySet()) {
0770       assertTrue(entry.getKey(), UTF8String.fromString(entry.getKey()).toLong(wrapper));
0771       assertEquals((long) entry.getValue(), wrapper.value);
0772     }
0773 
0774     List<String> negativeInputs = Arrays.asList("", "  ", "null", "NULL", "\n", "~1212121",
0775         "1234567890123456789012345678901234");
0776 
0777     for (String negativeInput : negativeInputs) {
0778       assertFalse(negativeInput, UTF8String.fromString(negativeInput).toLong(wrapper));
0779     }
0780   }
0781 
0782   @Test
0783   public void trimBothWithTrimString() {
0784     assertEquals(fromString("hello"), fromString("  hello ").trim(fromString(" ")));
0785     assertEquals(fromString("o"), fromString("  hello ").trim(fromString(" hle")));
0786     assertEquals(fromString("h e"), fromString("ooh e ooo").trim(fromString("o ")));
0787     assertEquals(fromString(""), fromString("ooo...oooo").trim(fromString("o.")));
0788     assertEquals(fromString("b"), fromString("%^b[]@").trim(fromString("][@^%")));
0789 
0790     assertEquals(EMPTY_UTF8, fromString("  ").trim(fromString(" ")));
0791 
0792     assertEquals(fromString("数据砖头"), fromString("  数据砖头 ").trim());
0793     assertEquals(fromString("数"), fromString("a数b").trim(fromString("ab")));
0794     assertEquals(fromString(""), fromString("a").trim(fromString("a数b")));
0795     assertEquals(fromString(""), fromString("数数 数数数").trim(fromString("数 ")));
0796     assertEquals(fromString("据砖头"), fromString("数]数[数据砖头#数数").trim(fromString("[数]#")));
0797     assertEquals(fromString("据砖头数数 "), fromString("数数数据砖头数数 ").trim(fromString("数")));
0798   }
0799 
0800   @Test
0801   public void trimLeftWithTrimString() {
0802     assertEquals(fromString("  hello "), fromString("  hello ").trimLeft(fromString("")));
0803     assertEquals(fromString(""), fromString("a").trimLeft(fromString("a")));
0804     assertEquals(fromString("b"), fromString("b").trimLeft(fromString("a")));
0805     assertEquals(fromString("ba"), fromString("ba").trimLeft(fromString("a")));
0806     assertEquals(fromString(""), fromString("aaaaaaa").trimLeft(fromString("a")));
0807     assertEquals(fromString("trim"), fromString("oabtrim").trimLeft(fromString("bao")));
0808     assertEquals(fromString("rim "), fromString("ooootrim ").trimLeft(fromString("otm")));
0809 
0810     assertEquals(EMPTY_UTF8, fromString("  ").trimLeft(fromString(" ")));
0811 
0812     assertEquals(fromString("数据砖头 "), fromString("  数据砖头 ").trimLeft(fromString(" ")));
0813     assertEquals(fromString("数"), fromString("数").trimLeft(fromString("a")));
0814     assertEquals(fromString("a"), fromString("a").trimLeft(fromString("数")));
0815     assertEquals(fromString("砖头数数"), fromString("数数数据砖头数数").trimLeft(fromString("据数")));
0816     assertEquals(fromString("据砖头数数"), fromString(" 数数数据砖头数数").trimLeft(fromString("数 ")));
0817     assertEquals(fromString("据砖头数数"), fromString("aa数数数据砖头数数").trimLeft(fromString("a数砖")));
0818     assertEquals(fromString("$S,.$BR"), fromString(",,,,%$S,.$BR").trimLeft(fromString("%,")));
0819   }
0820 
0821   @Test
0822   public void trimRightWithTrimString() {
0823     assertEquals(fromString("  hello "), fromString("  hello ").trimRight(fromString("")));
0824     assertEquals(fromString(""), fromString("a").trimRight(fromString("a")));
0825     assertEquals(fromString("cc"), fromString("ccbaaaa").trimRight(fromString("ba")));
0826     assertEquals(fromString(""), fromString("aabbbbaaa").trimRight(fromString("ab")));
0827     assertEquals(fromString("  he"), fromString("  hello ").trimRight(fromString(" ol")));
0828     assertEquals(fromString("oohell"),
0829         fromString("oohellooo../*&").trimRight(fromString("./,&%*o")));
0830 
0831     assertEquals(EMPTY_UTF8, fromString("  ").trimRight(fromString(" ")));
0832 
0833     assertEquals(fromString("  数据砖头"), fromString("  数据砖头 ").trimRight(fromString(" ")));
0834     assertEquals(fromString("数数砖头"), fromString("数数砖头数aa数").trimRight(fromString("a数")));
0835     assertEquals(fromString(""), fromString("数数数据砖ab").trimRight(fromString("数据砖ab")));
0836     assertEquals(fromString("头"), fromString("头a???/").trimRight(fromString("数?/*&^%a")));
0837     assertEquals(fromString("头"), fromString("头数b数数 [").trimRight(fromString(" []数b")));
0838   }
0839 
0840   @Test
0841   public void skipWrongFirstByte() {
0842     int[] wrongFirstBytes = {
0843       0x80, 0x9F, 0xBF, // Skip Continuation bytes
0844       0xC0, 0xC2, // 0xC0..0xC1 - disallowed in UTF-8
0845       // 0xF5..0xFF - disallowed in UTF-8
0846       0xF5, 0xF6, 0xF7, 0xF8, 0xF9,
0847       0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
0848     };
0849     byte[] c = new byte[1];
0850 
0851     for (int i = 0; i < wrongFirstBytes.length; ++i) {
0852       c[0] = (byte)wrongFirstBytes[i];
0853       assertEquals(1, fromBytes(c).numChars());
0854     }
0855   }
0856 }