oracle simhash,网页内容相似度之SimHash算法

发布于:2021-11-29 22:59:54

packagecom.cnblogs.zxub.lucene.similarity;importjava.io.IOException;importjava.math.BigInteger;importjava.util.Map;importjava.util.Set;public classSimHash {private static final int HASH_BITS = 64;private static final BigInteger FNV_64_INIT = newBigInteger("14695981039346656037");private static final BigInteger FNV_64_PRIME = newBigInteger("1099511628211");private static final BigInteger MASK_64 =BigInteger.ONE.shiftLeft(


HASH_BITS).subtract(BigInteger.ONE);privateString hash;privateBigInteger signature;public SimHash(String content) throwsIOException {super();this.setFingerPrint(WordsSpliter.getSplitedWords(content));


}publicString getHash() {return this.hash;


}publicBigInteger getSignature() {return this.signature;


}private void setFingerPrint(MapwordInfos) {int[] featureVector = new int[SimHash.HASH_BITS];


Set words =wordInfos.keySet();for(String word : words) {


BigInteger wordhash= this.fnv1_64_hash(word);for (int i = 0; i < SimHash.HASH_BITS; i++) {


BigInteger bitmask=BigInteger.ONE.shiftLeft(SimHash.HASH_BITS- i - 1);if (wordhash.and(bitmask).signum() != 0) {


featureVector[i]+=wordInfos.get(word);


}else{


featureVector[i]-=wordInfos.get(word);


}


}


}


BigInteger signature=BigInteger.ZERO;


StringBuffer hashBuffer= newStringBuffer();for (int i = 0; i < SimHash.HASH_BITS; i++) {if (featureVector[i] >= 0) {


signature=signature.add(BigInteger.ONE


.shiftLeft(SimHash.HASH_BITS- i - 1));


hashBuffer.append("1");


}else{


hashBuffer.append("0");


}


}this.hash =hashBuffer.toString();this.signature =signature;


}//fnv-1 hash算法,将字符串转换为64位hash值


privateBigInteger fnv1_64_hash(String str) {


BigInteger hash=FNV_64_INIT;int len =str.length();for (int i = 0; i < len; i++) {


hash=hash.multiply(FNV_64_PRIME);


hash=hash.xor(BigInteger.valueOf(str.charAt(i)));


}


hash=hash.and(MASK_64);returnhash;


}public intgetHammingDistance(BigInteger targetSignature) {


BigInteger x= this.getSignature().xor(targetSignature);


String s= x.toString(2);return s.replaceAll("0", "").length();


}public intgetHashDistance(String targetHash) {intdistance;if (this.getHash().length() !=targetHash.length()) {


distance= -1;


}else{


distance= 0;for (int i = 0; i < this.getHash().length(); i++) {if (this.getHash().charAt(i) !=targetHash.charAt(i)) {


distance++;


}


}


}returndistance;


}


}







相关资源:Toad for Oracle 12.1(包含32位、64位版本)绿色注册版

相关推荐

最新更新

猜你喜欢