Paging by Hash Collision

When storing large databases, data must be retrievable efficiently, which means keeping the number of I/O accesses low. Most existing database systems use B-tree or B+-tree index algorithms.
The usual approach is to logically page the data stored on physical units and then cache a page index; some systems even page twice. The single goal is to minimize I/O accesses.
The retrieval algorithm pages the physical data and maintains a page index of keys in memory or on external storage. To retrieve a record, the key is matched against the page index, the matching physical page is read from external storage into memory, and the record is then matched within that page. For large data sets, several page reads may be required.
The hash-collision paging algorithm:
It addresses the logical-paging problem. For the data in a physical unit, a hash is computed over each record's key. By the nature of hashing, all records whose keys collide (hash to the same value) are grouped into one page unit, and these page units are then stored. To keep page units consistent, each page unit is divided into page blocks of a fixed size; the most recently accessed page blocks are cached in memory, while the remaining blocks are kept on external storage.
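The grouping step can be sketched as follows. This is a minimal in-memory illustration of the idea only: the `hash` function is the same supplemental hash used in the implementation below, but the `groupByHash` helper is hypothetical and not part of the original code.

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class CollisionGrouping {
    // Same supplemental hash as in DiskBtreeStore below.
    static int hash(Object x) {
        int h = x.hashCode();
        h += ~(h << 9);
        h ^= (h >>> 14);
        h += (h << 4);
        h ^= (h >>> 10);
        return h;
    }

    // Group keys so that all keys whose hashes collide land in one "page unit".
    static Map<Integer, List<String>> groupByHash(List<String> keys) {
        Map<Integer, List<String>> units = new HashMap<>();
        for (String key : keys) {
            units.computeIfAbsent(hash(key), k -> new ArrayList<>()).add(key);
        }
        return units;
    }

    public static void main(String[] args) {
        // The two identical keys necessarily collide and share a page unit.
        List<String> keys = List.of("a", "b", "a");
        Map<Integer, List<String>> units = groupByHash(keys);
        System.out.println(units.get(hash("a")).size());
    }
}
```

On disk, each such unit would then be split into fixed-size page blocks, as the `MemoryBlockCache` class below does.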
Retrieval cost:
Hash the key to determine which page unit it belongs to, then access that unit's page blocks.
If a page unit contains n page blocks, they are scanned sequentially,
so the average number of I/O accesses is n/2,
giving a time complexity of O(n).
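The n/2 figure is the expected cost of a sequential scan when the target record is equally likely to sit in any block. A quick enumeration (with a hypothetical n = 100, matching the 100-record blocks used below) confirms it:

```java
public class AverageProbes {
    public static void main(String[] args) {
        int n = 100; // page blocks in one unit
        // If the target is in block #k, a sequential scan touches k blocks.
        // Averaging over all equally likely positions gives (n + 1) / 2 ~ n/2.
        long totalProbes = 0;
        for (int target = 1; target <= n; target++) {
            totalProbes += target;
        }
        System.out.println((double) totalProbes / n);
    }
}
```

For n = 100 this prints 50.5, i.e. roughly n/2 block reads per lookup.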

Algorithm structure diagram: (figure omitted)

Java implementation:
Only the key-index maintenance is implemented here. The program assumes every key has already been hashed with MD5; this is just one practical scenario. MD5 strings are not required, and a small change to the program removes that assumption.
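The test code at the end uses an `MD5` class whose source is not shown in this post. A minimal stand-in with the same `getMD5ofStr` signature can be sketched with the standard `java.security.MessageDigest` API, which yields the 32-character hex digests the 32-byte record size assumes:

```java
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

// Hypothetical stand-in for the MD5 class used by the Test class below.
public class MD5 {
    public String getMD5ofStr(String input) {
        try {
            MessageDigest md = MessageDigest.getInstance("MD5");
            byte[] digest = md.digest(input.getBytes(StandardCharsets.UTF_8));
            // Render the 16 digest bytes as a 32-character hex string.
            StringBuilder hex = new StringBuilder(32);
            for (byte b : digest) {
                hex.append(String.format("%02x", b));
            }
            return hex.toString();
        } catch (NoSuchAlgorithmException e) {
            throw new IllegalStateException("MD5 not available", e);
        }
    }

    public static void main(String[] args) {
        // RFC 1321 test vector: MD5("abc")
        System.out.println(new MD5().getMD5ofStr("abc"));
    }
}
```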



Implementation:
DiskBtreeStore.java

package com;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * Hash-collision paging storage implementation.
 */
public class DiskBtreeStore {
    /**
     * The default initial capacity - MUST be a power of two.
     */
    static final int DEFAULT_INITIAL_CAPACITY = 256;

    /**
     * The maximum capacity, used if a higher value is implicitly specified
     * by either of the constructors with arguments.
     * MUST be a power of two <= 1<<30.
     */
    static final int MAXIMUM_CAPACITY = 1 << 30;

    /**
     * The load factor used when none specified in constructor.
     */
    static final float DEFAULT_LOAD_FACTOR = 0.75f;

    transient MemoryBlockCache table[];

    /**
     * The number of page units contained in this store.
     */
    transient int size;

    /**
     * The next size value at which to resize (capacity * load factor).
     */
    int threshold;

    /**
     * The load factor for the hash table.
     */
    final float loadFactor;

    // File streams backing the on-disk storage.
    private FileOutputStream out;
    private FileInputStream in;

    /**
     * Constructs an empty store with the specified initial
     * capacity and load factor.
     *
     * @param initialCapacity The initial capacity.
     * @param loadFactor The load factor.
     * @throws FileNotFoundException
     * @throws IllegalArgumentException if the initial capacity is negative
     *         or the load factor is nonpositive.
     */
    public DiskBtreeStore(int initialCapacity, float loadFactor, String filename, boolean reuse) throws FileNotFoundException {
        if (initialCapacity < 0)
            throw new IllegalArgumentException("Illegal initial capacity: " + initialCapacity);
        if (initialCapacity > MAXIMUM_CAPACITY)
            initialCapacity = MAXIMUM_CAPACITY;
        if (loadFactor <= 0 || Float.isNaN(loadFactor))
            throw new IllegalArgumentException("Illegal load factor: " + loadFactor);

        // Find a power of 2 >= initialCapacity
        int capacity = 1;
        while (capacity < initialCapacity)
            capacity <<= 1;

        this.loadFactor = loadFactor;
        threshold = (int) (capacity * loadFactor);
        table = new MemoryBlockCache[capacity];
        out = new FileOutputStream(filename, reuse);
        in = new FileInputStream(filename);
    }

    public DiskBtreeStore(int initialCapacity, String fileName, boolean reuse) throws FileNotFoundException {
        this(initialCapacity, DEFAULT_LOAD_FACTOR, fileName, reuse);
    }

    /**
     * Constructs an empty store with the default initial capacity
     * (256) and the default load factor (0.75).
     * @throws FileNotFoundException
     */
    public DiskBtreeStore(String filename, boolean reuse) throws FileNotFoundException {
        this.loadFactor = DEFAULT_LOAD_FACTOR;
        threshold = (int) (DEFAULT_INITIAL_CAPACITY * DEFAULT_LOAD_FACTOR);
        table = new MemoryBlockCache[DEFAULT_INITIAL_CAPACITY];
        out = new FileOutputStream(filename, reuse);
        in = new FileInputStream(filename);
    }

    /* Add a key. */
    public void add(String key) {
        int hash = hash(key);
        int i = indexFor(hash, size);
        // A page unit with this hash already exists.
        if (i > -1) {
            MemoryBlockCache existBlock = table[i];
            existBlock.add(key);
        } else {
            table[size] = new MemoryBlockCache(hash, out, in);
            table[size].add(key);
            if (size++ > threshold)
                resize(table.length + DEFAULT_INITIAL_CAPACITY);
        }
    }

    public boolean contain(String key) {
        int hash = hash(key);
        int i = indexFor(hash, size);
        // The hash must be found first.
        if (i > -1) {
            MemoryBlockCache existBlock = table[i];
            if (existBlock.containKey(key)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Linear scan for the page unit whose hash matches.
     */
    private int indexFor(int hash, int length) {
        for (int i = 0; i < length; i++) {
            if (table[i].key == hash) {
                return i;
            }
        }
        return -1;
    }

    public int size() {
        return size;
    }

    static int hash(Object x) {
        int h = x.hashCode();
        h += ~(h << 9);
        h ^= (h >>> 14);
        h += (h << 4);
        h ^= (h >>> 10);
        return h;
    }

    void resize(int newCapacity) {
        MemoryBlockCache[] oldTable = table;
        int oldCapacity = oldTable.length;
        if (oldCapacity == MAXIMUM_CAPACITY) {
            threshold = Integer.MAX_VALUE;
            return;
        }

        MemoryBlockCache[] newTable = new MemoryBlockCache[newCapacity];
        transfer(newTable);
        table = newTable;
        threshold = (int) (newCapacity * loadFactor);
    }

    /**
     * Transfer all entries from current table to newTable.
     */
    void transfer(MemoryBlockCache[] newTable) {
        MemoryBlockCache[] src = table;
        for (int j = 0; j < src.length; j++) {
            newTable[j] = src[j];
        }
    }

    /* (non-Javadoc)
     * @see java.lang.Object#toString()
     */
    public String toString() {
        StringBuffer buf = new StringBuffer();
        for (int i = 0; i < size; i++) {
            buf.append(table[i].toString());
        }
        return buf.toString();
    }

    /**
     * Close the file I/O streams.
     */
    public void clear() {
        try {
            in.close();
            out.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}


MemoryBlockCache.java
/*
 * Buffers data in memory and passivates it to disk when the buffer fills.
 */
package com;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;

/**
 * Represents a block of data held in memory.
 */
public class MemoryBlockCache {
    private final int MAXLEN = 100;      // records per page block
    private final int RECORD_SIZE = 32;  // bytes per record (an MD5 hex string)
    public int key;
    private FileOutputStream out;
    private FileInputStream in;
    private StringBuffer buf = new StringBuffer(100 * 32); // buffer for 100 records
    private List<Page> pages = new ArrayList<Page>();

    public MemoryBlockCache(int hash, FileOutputStream out, FileInputStream in) {
        key = hash;
        this.out = out;
        this.in = in;
    }

    /**
     * Checks whether the required data exists, scanning the in-memory
     * buffer first and then every page block belonging to this unit.
     */
    public boolean containKey(String key) {
        if (buf.indexOf(key) > -1)
            return true;
        for (int i = 0; i < pages.size(); i++) {
            // Look up the required page.
            Page page = pages.get(i);
            byte[] cache = this.readBlock(page.getPageId());
            if (cache != null) {
                String str = new String(cache);
                if (str.indexOf(key) > -1)
                    return true;
            }
        }
        return false;
    }

    public void add(String path) {
        if (buf.length() / RECORD_SIZE < MAXLEN) {
            buf.append(path);
            return;
        }
        // The buffer is full: flush the cached data to disk.
        long pageId = writeBlock();
        if (pageId > -1) {
            // The write succeeded.
            pages.add(new Page(pageId));
        }
        // Reset the buffer and cache the new record.
        buf.delete(0, MAXLEN * RECORD_SIZE);
        buf.append(path);
    }

    private synchronized long writeBlock() {
        String str = buf.toString();
        try {
            long position = out.getChannel().position();
            out.write(str.getBytes());
            return position;
        } catch (IOException e) {
            System.err.println("persisting the data failed: " + e.getMessage());
            return -1; // write error
        }
    }

    private synchronized byte[] readBlock(long pageId) {
        try {
            MappedByteBuffer mapped = in.getChannel().map(FileChannel.MapMode.READ_ONLY, pageId, MAXLEN * RECORD_SIZE);
            if (!mapped.isLoaded()) {
                mapped.load();
            }
            // A mapped buffer is direct, so array() would throw
            // UnsupportedOperationException; copy the bytes out instead.
            byte[] store = new byte[mapped.remaining()];
            mapped.get(store);
            return store;
        } catch (IOException e) {
            return null;
        }
    }

    public String toString() {
        StringBuffer buf = new StringBuffer();
        buf.append("(current buffer size: " + this.buf.length());
        buf.append(", ");
        buf.append("pages on disk: " + pages.toString());
        buf.append(")");
        return buf.toString();
    }

    class Page {
        private long pageId = 0; // offset of this block in the file

        public Page(long pageId) {
            this.pageId = pageId;
        }

        /**
         * @return Returns the pageId.
         */
        public long getPageId() {
            return pageId;
        }

        /**
         * @param pageId The pageId to set.
         */
        public void setPageId(long pageId) {
            this.pageId = pageId;
        }

        public String toString() {
            return String.valueOf(pageId);
        }
    }
}


Test code:

package com;

import java.io.FileNotFoundException;

public class Test {
    public static void main(String args[]) throws FileNotFoundException {
        DiskBtreeStore tree = new DiskBtreeStore("d:\\storefile\\visisted.soosoo", false);
        MD5 enc = new MD5();
        long t = System.currentTimeMillis();
        for (int i = 0; i < 1000000; i++) {
            String str = enc.getMD5ofStr("王建华" + i);
            tree.add(str);
            if (i % 100 == 0)
                System.out.println("done " + i);
        }
        String find = enc.getMD5ofStr("王建华323123");
        long t1 = System.currentTimeMillis();
        System.out.println(tree.toString());
        System.out.println(tree.contain(find));
        long t2 = System.currentTimeMillis();
        System.out.println("index build time: " + (t1 - t));
        System.out.println("lookup time: " + (t2 - t1));
    }
}

Results of the test above:
Maintaining the index for a table at the 1,000,000-record scale takes roughly 30 s; since many database systems add an optimization layer over their I/O management, the actual time may well be much smaller.
Even in the most pessimistic case, retrieving indexed data takes no more than 0.1 s; if the index key is already in memory, the time is almost negligible.

Of course, maintaining the index comes at the cost of extra storage.