Back to home page

OSCL-LXR

 
 

    


0001 /*
0002  * Licensed to the Apache Software Foundation (ASF) under one or more
0003  * contributor license agreements.  See the NOTICE file distributed with
0004  * this work for additional information regarding copyright ownership.
0005  * The ASF licenses this file to You under the Apache License, Version 2.0
0006  * (the "License"); you may not use this file except in compliance with
0007  * the License.  You may obtain a copy of the License at
0008  *
0009  *    http://www.apache.org/licenses/LICENSE-2.0
0010  *
0011  * Unless required by applicable law or agreed to in writing, software
0012  * distributed under the License is distributed on an "AS IS" BASIS,
0013  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
0014  * See the License for the specific language governing permissions and
0015  * limitations under the License.
0016  */
0017 
0018 package org.apache.spark.sql.catalyst.expressions;
0019 
0020 import org.apache.spark.unsafe.Platform;
0021 import org.apache.spark.unsafe.types.UTF8String;
0022 import org.junit.Assert;
0023 import org.junit.Test;
0024 
0025 import java.nio.charset.StandardCharsets;
0026 import java.util.HashSet;
0027 import java.util.Random;
0028 import java.util.Set;
0029 
0030 public class HiveHasherSuite {
0031 
0032   @Test
0033   public void testKnownIntegerInputs() {
0034     int[] inputs = {0, Integer.MIN_VALUE, Integer.MAX_VALUE, 593689054, -189366624};
0035     for (int input : inputs) {
0036       Assert.assertEquals(input, HiveHasher.hashInt(input));
0037     }
0038   }
0039 
0040   @Test
0041   public void testKnownLongInputs() {
0042     Assert.assertEquals(0, HiveHasher.hashLong(0L));
0043     Assert.assertEquals(41, HiveHasher.hashLong(-42L));
0044     Assert.assertEquals(42, HiveHasher.hashLong(42L));
0045     Assert.assertEquals(-2147483648, HiveHasher.hashLong(Long.MIN_VALUE));
0046     Assert.assertEquals(-2147483648, HiveHasher.hashLong(Long.MAX_VALUE));
0047   }
0048 
0049   @Test
0050   public void testKnownStringAndIntInputs() {
0051     int[] inputs = {84, 19, 8};
0052     int[] expected = {-823832826, -823835053, 111972242};
0053 
0054     for (int i = 0; i < inputs.length; i++) {
0055       UTF8String s = UTF8String.fromString("val_" + inputs[i]);
0056       int hash = HiveHasher.hashUnsafeBytes(s.getBaseObject(), s.getBaseOffset(), s.numBytes());
0057       Assert.assertEquals(expected[i], ((31 * inputs[i]) + hash));
0058     }
0059   }
0060 
0061   @Test
0062   public void randomizedStressTest() {
0063     int size = 65536;
0064     Random rand = new Random();
0065 
0066     // A set used to track collision rate.
0067     Set<Integer> hashcodes = new HashSet<>();
0068     for (int i = 0; i < size; i++) {
0069       int vint = rand.nextInt();
0070       long lint = rand.nextLong();
0071       Assert.assertEquals(HiveHasher.hashInt(vint), HiveHasher.hashInt(vint));
0072       Assert.assertEquals(HiveHasher.hashLong(lint), HiveHasher.hashLong(lint));
0073 
0074       hashcodes.add(HiveHasher.hashLong(lint));
0075     }
0076 
0077     // A very loose bound.
0078     Assert.assertTrue(hashcodes.size() > size * 0.95);
0079   }
0080 
0081   @Test
0082   public void randomizedStressTestBytes() {
0083     int size = 65536;
0084     Random rand = new Random();
0085 
0086     // A set used to track collision rate.
0087     Set<Integer> hashcodes = new HashSet<>();
0088     for (int i = 0; i < size; i++) {
0089       int byteArrSize = rand.nextInt(100) * 8;
0090       byte[] bytes = new byte[byteArrSize];
0091       rand.nextBytes(bytes);
0092 
0093       Assert.assertEquals(
0094           HiveHasher.hashUnsafeBytes(bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize),
0095           HiveHasher.hashUnsafeBytes(bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
0096 
0097       hashcodes.add(HiveHasher.hashUnsafeBytes(
0098           bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
0099     }
0100 
0101     // A very loose bound.
0102     Assert.assertTrue(hashcodes.size() > size * 0.95);
0103   }
0104 
0105   @Test
0106   public void randomizedStressTestPaddedStrings() {
0107     int size = 64000;
0108     // A set used to track collision rate.
0109     Set<Integer> hashcodes = new HashSet<>();
0110     for (int i = 0; i < size; i++) {
0111       int byteArrSize = 8;
0112       byte[] strBytes = String.valueOf(i).getBytes(StandardCharsets.UTF_8);
0113       byte[] paddedBytes = new byte[byteArrSize];
0114       System.arraycopy(strBytes, 0, paddedBytes, 0, strBytes.length);
0115 
0116       Assert.assertEquals(
0117           HiveHasher.hashUnsafeBytes(paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize),
0118           HiveHasher.hashUnsafeBytes(paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
0119 
0120       hashcodes.add(HiveHasher.hashUnsafeBytes(
0121           paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
0122     }
0123 
0124     // A very loose bound.
0125     Assert.assertTrue(hashcodes.size() > size * 0.95);
0126   }
0127 }