C代码
int file_chunk_cdc(int fd, vector* features) {
unsigned char buf[BUF_MAX_SIZE] = {0};
unsigned char buf_bz[BUF_MAX_SIZE] = {0};
unsigned char block_buf[BLOCK_MAX_SIZE * 2] = {0};
unsigned char last_block_buf[BLOCK_MAX_SIZE * 2] = {0};
char win_buf[BLOCK_WIN_SIZE + 1] = {0};
unsigned char md5_str[33] = {0};
unsigned char adler_pre_char;
unsigned char md5_checksum[32 + 1] = {0};
unsigned int bpos = 0;
unsigned int rwsize = 0, bzsize = 0;
unsigned int exp_rwsize = BUF_MAX_SIZE;
unsigned int head, tail;
unsigned int block_sz = 0, old_block_sz = 0;
unsigned int hkey = 0;
int ret = 0;
feature_t f = 0;
while(rwsize = read(fd, buf + bpos, exp_rwsize))
{
/* last chunk */
if ((rwsize + bpos + block_sz) < BLOCK_MIN_SIZE){
break;
}
head = 0;
tail = bpos + rwsize;
/* avoid unnecessary computation and comparsion */
if (block_sz < (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE))
{
old_block_sz = block_sz;
block_sz = ((block_sz + tail - head) > (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ?
BLOCK_MIN_SIZE - BLOCK_WIN_SIZE : block_sz + tail -head;
memcpy(block_buf + old_block_sz, buf + head, block_sz - old_block_sz);
head += (block_sz - old_block_sz);
}
while ((head + BLOCK_WIN_SIZE) <= tail)
{
memcpy(win_buf, buf + head, BLOCK_WIN_SIZE);
/*
* Firstly, i think rabinhash is the best. However, it's performance is very bad.
* After some testing, i found ELF_hash is better both on performance and dedup rate.
* So, EFL_hash is default. Now, adler_hash as default.
*/
if (g_rolling_hash)
{
hkey = (block_sz == (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ? adler32_checksum(win_buf, BLOCK_WIN_SIZE) :
adler32_rolling_checksum(hkey, BLOCK_WIN_SIZE, adler_pre_char, buf[head+BLOCK_WIN_SIZE-1]);
}
else
hkey = g_cdc_chunk_hashfunc(win_buf);
/* get a normal chunk */
if ((hkey % g_block_size) == CHUNK_CDC_R)
{
memcpy(block_buf + block_sz, buf + head, BLOCK_WIN_SIZE);
head += BLOCK_WIN_SIZE;
block_sz += BLOCK_WIN_SIZE;
if (block_sz >= BLOCK_MIN_SIZE)
{
md5(block_buf, block_sz, md5_checksum);
f = md5_2_feature(md5_checksum);
VEC_PUSH_BACK(features, &f);
/*
if (0 != (ret = dedup_regfile_block_process(block_buf, block_sz,
md5_checksum, fd_ldata, fd_bdata, pos, block_num, metadata, htable)))
{
perror("dedup_reggile_block_process in file_chunk_cdc");
goto _FILE_CHUNK_CDC_EXIT;
}
*/
block_sz = 0;
}
}
else
{
block_buf[block_sz++] = buf[head++];
/* get an abnormal chunk */
if (block_sz >= BLOCK_MAX_SIZE)
{
md5(block_buf, block_sz, md5_checksum);
f = md5_2_feature(md5_checksum);
VEC_PUSH_BACK(features, &f);
/*
if (0 != (ret = dedup_regfile_block_process(block_buf, block_sz,
md5_checksum, fd_ldata, fd_bdata, pos, block_num, metadata, htable)))
{
perror("dedup_reggile_block_process in file_chunk_cdc");
goto _FILE_CHUNK_CDC_EXIT;
}
*/
block_sz = 0;
}
}
/* avoid unnecessary computation and comparsion */
if (block_sz == 0)
{
block_sz = ((tail - head) > (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ?
BLOCK_MIN_SIZE - BLOCK_WIN_SIZE : tail - head;
memcpy(block_buf, buf + head, block_sz);
head = ((tail - head) > (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE)) ?
head + (BLOCK_MIN_SIZE - BLOCK_WIN_SIZE) : tail;
}
adler_pre_char = buf[head -1];
}
/* read expected data from file to full up buf */
bpos = tail - head;
exp_rwsize = BUF_MAX_SIZE - bpos;
adler_pre_char = buf[head -1];
memmove(buf, buf + head, bpos);
}
/* last chunk */
int last_block_len = ((rwsize + bpos + block_sz) >= 0) ? rwsize + bpos + block_sz : 0;
if (last_block_len > 0)
{
memcpy(last_block_buf, block_buf, block_sz);
memcpy(last_block_buf + block_sz, buf, rwsize + bpos);
md5(last_block_buf, last_block_len, md5_checksum);
f = md5_2_feature(md5_checksum);
VEC_PUSH_BACK(features, &f);
}
_FILE_CHUNK_CDC_EXIT:
return 0;
}
/* slide block chunk */
int file_chunk_sb(int fd, vector* features) {
char buf[BUF_MAX_SIZE] = {0};
char buf_bz[BUF_MAX_SIZE] = {0};
char win_buf[BLOCK_MAX_SIZE * 2] = {0};
char block_buf[BLOCK_MAX_SIZE * 2] = {0};
char adler_pre_char;
unsigned char md5_checksum[32 + 1] = {0};
unsigned char md5_checksum1[32 + 1] = {0};
char crc_checksum[16] = {0};
unsigned int bpos = 0;
unsigned int slide_sz = 0;
unsigned int rwsize = 0, bzsize = 0, bzsize_f = 0;
unsigned int exp_rwsize = BUF_MAX_SIZE;
unsigned int head, tail;
unsigned int hkey = 0;
unsigned int bflag = 0;
int ret = 0;
hashtable* sb_htable = create_hashtable(g_htab_bucket_nr);
hashtable* sb_htable_crc = create_hashtable(g_htab_bucket_nr);
if (NULL == sb_htable_crc || sb_htable == NULL)
return -1;
feature_t f, f1;
while(rwsize = read(fd, buf + bpos, exp_rwsize)) {
/* last chunk */
/*
if ((rwsize + bpos + slide_sz) < g_block_size)
break;
*/
head = 0;
tail = bpos + rwsize;
while ((head + g_block_size) <= tail) {
memcpy(win_buf, buf + head, g_block_size);
hkey = (slide_sz == 0) ? adler32_checksum(win_buf, g_block_size) :
adler32_rolling_checksum(hkey, g_block_size, adler_pre_char, buf[head+g_block_size-1]);
uint_2_str(hkey, crc_checksum);
/* bflag: 0, both CRC and MD5 are not idenitical
1, both CRC and MD5 are identical
2, CRC is identical and MD5 is not
*/
bflag = 0;
/* this block maybe is duplicate */
bzsize = g_block_size;
if (hash_exist(sb_htable_crc, crc_checksum))
{
bflag = 2;
md5((unsigned char*)win_buf, bzsize, md5_checksum);
f = md5_2_feature(md5_checksum);
md5_2_str(md5_checksum);
if (hash_exist(sb_htable, (char*)md5_checksum))
{
/* insert fragment */
if (slide_sz != 0)
{
md5((unsigned char*)block_buf, slide_sz, md5_checksum1);
f1 = md5_2_feature(md5_checksum1);
VEC_PUSH_BACK(features, &f1);
/*
if (0 != (ret = dedup_regfile_block_process(block_buf, slide_sz, md5_checksum1,
fd_ldata, fd_bdata, pos, block_num, metadata, htable)))
{
perror("dedup_regfile_block_process in file_chunk_sb");
goto _FILE_CHUNK_SB_EXIT;
}
*/
}
VEC_PUSH_BACK(features, &f);
/* insert fixed-size block */
/*
if (0 != (ret = dedup_regfile_block_process(win_buf, bzsize, md5_checksum,
fd_ldata, fd_bdata, pos, block_num, metadata, htable)))
{
perror("dedup_regfile_block_process in file_chunk_sb");
goto _FILE_CHUNK_SB_EXIT;
}
*/
head += g_block_size;
slide_sz = 0;
bflag = 1;
}
}
/* this block is not duplicate */
if (bflag != 1)
{
block_buf[slide_sz++] = buf[head++];
if (slide_sz == g_block_size)
{
bzsize = g_block_size;
/* calculate checksum and check in */
hkey = adler32_checksum(block_buf, bzsize);
uint_2_str(hkey, crc_checksum);
hash_checkin(sb_htable_crc, crc_checksum);
md5((unsigned char*)block_buf, bzsize, md5_checksum);
f = md5_2_feature(md5_checksum);
VEC_PUSH_BACK(features, &f);
/*
if (0 != (ret = dedup_regfile_block_process(block_buf, bzsize, md5_checksum,
fd_ldata, fd_bdata, pos, block_num, metadata, htable)))
{
perror("dedup_regfile_block_process in file_chunk_sb");
goto _FILE_CHUNK_SB_EXIT;
}
*/
slide_sz = 0;
}
}
adler_pre_char = buf[head - 1];
}
/* read expected data from file to full up buf */
bpos = tail - head;
exp_rwsize = BUF_MAX_SIZE - bpos;
adler_pre_char = buf[head - 1];
memmove(buf, buf + head, bpos);
}
/* last chunk */
/*
*last_block_len = ((rwsize + bpos + slide_sz) > 0) ? rwsize + bpos + slide_sz : 0;
if (*last_block_len > 0)
{
memcpy(last_block_buf, block_buf, slide_sz);
memcpy(last_block_buf + slide_sz, buf, rwsize + bpos);
}
*/
_FILE_CHUNK_SB_EXIT:
lseek(fd, 0, SEEK_SET);
hash_free(sb_htable);
hash_free(sb_htable_crc);
return 0;
}
java代码
package cn.edu.cust.deduple;
import
import
import
import
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import cn.edu.cust.deduple.utils.Checksum;
import cn.edu.cust.deduple.utils.MapUtils;
import cn.edu.cust.deduple.utils.Md5Util;
/**
* @ClassName: CDC
* @Description:
* @author: 王大鸟
* @date: 2016/5/18
*/
public class CDC {
/**
* rabin指纹之除数
*/
private static final int D = 5;
/**
* rabin指纹之商
*/
private static final int F = 3;
/**
* 最大值
*/
private static final int MAX = 8 * 1024;
/**
* 最小值
*/
private static final int MIN = 4 * 1024;
/**
* 滑动窗口大小
*/
private static final int WINLEN = 4 * 1024;
private static Map<String, Long> fingerPrints = new HashMap<String, Long>();
static final int BUF_MAX_SZ = 128 * 1024;
static final int BLOCK_MAX_SZ = 4096;
static final int BLOCK_WIN_SZ = 32;
static final int BLOCK_MIN_SZ = 64;
static final int BLOCK_SZ = 4096;
static final int CHUNK_CDC_R = 13;
FileChunks fileChunk(File f) throws IOException {
byte buf[] = new byte[BUF_MAX_SZ];
byte block_buf[] = new byte[BLOCK_MAX_SZ + BLOCK_WIN_SZ];
byte win_buf[] = new byte[BLOCK_WIN_SZ + 1];
byte adler_pre_char = 0;
//unsigned char md5_checksum[16 + 1] = {0};
//unsigned char csum[10 + 1] = {0};
int bpos = 0;
int rwsize = 0;
int exp_rwsize = BUF_MAX_SZ;
int head, tail;
int block_sz = 0, old_block_sz = 0;
long hkey = 0;
//chunk_block_entry chunk_bentry;
long offset = 0;
FileInputStream fin = new FileInputStream(f);
FileChannel fc = fin.getChannel();
ByteBuffer bb = ByteBuffer.wrap(buf, bpos, exp_rwsize);
FileChunks fcs = new FileChunks();
while((rwsize = fc.read(bb)) >= 0) {
/* last chunk */
System.out.println("rwsize:" + rwsize);
if ((rwsize + bpos + block_sz) < BLOCK_MIN_SZ)
break;
head = 0;
tail = bpos + rwsize;
/* avoid unnecessary computation and comparsion */
if (block_sz < (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) {
old_block_sz = block_sz;
block_sz = ((block_sz + tail - head) > (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) ?
BLOCK_MIN_SZ - BLOCK_WIN_SZ : block_sz + tail -head;
System.arraycopy(buf, head, block_buf, old_block_sz, block_sz - old_block_sz);
//memcpy(block_buf + old_block_sz, buf + head, block_sz - old_block_sz);
head += (block_sz - old_block_sz);
}
while ((head + BLOCK_WIN_SZ) <= tail) {
System.arraycopy(buf, head, win_buf, 0, BLOCK_WIN_SZ);
//memcpy(win_buf, buf + head, BLOCK_WIN_SZ);
hkey = (block_sz == (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) ? Checksum.adler32_checksum(win_buf, BLOCK_WIN_SZ) :
Checksum.adler32_rolling_checksum((int)hkey, BLOCK_WIN_SZ, adler_pre_char, buf[head+BLOCK_WIN_SZ-1]);
//System.out.println("hkey:" + (hkey % BLOCK_SZ));
/* get a normal chunk, write block info to chunk file */
if ((hkey % BLOCK_SZ) == CHUNK_CDC_R) {
//System.out.println(block_sz + BLOCK_WIN_SZ);
System.arraycopy(buf, head, block_buf, block_sz, BLOCK_WIN_SZ);
//memcpy(block_buf + block_sz, buf + head, BLOCK_WIN_SZ);
head += BLOCK_WIN_SZ;
block_sz += BLOCK_WIN_SZ;
if(block_sz > BLOCK_MAX_SZ){
System.out.println(">4096:" + block_sz);
}
if (block_sz >= BLOCK_MIN_SZ) {
fcs.addChunk(offset, block_sz, block_buf);
/*md5(block_buf, block_sz, md5_checksum);
uint_2_str(adler32_checksum(block_buf, block_sz), csum);
chunk_file_hdr->block_nr++;
chunk_bentry.len = block_sz;
chunk_bentry.offset = offset;
memcpy(chunk_bentry.md5, md5_checksum, 16 + 1);
memcpy(chunk_bentry.csum, csum, 10 + 1);
rwsize = write(fd_chunk, &chunk_bentry, CHUNK_BLOCK_ENTRY_SZ);
if (rwsize == -1 || rwsize != CHUNK_BLOCK_ENTRY_SZ)
return -1;*/
offset += block_sz;
block_sz = 0;
}
} else {
block_buf[block_sz++] = buf[head++];
/* get an abnormal chunk, write block info to chunk file */
if (block_sz >= BLOCK_MAX_SZ) {
fcs.addChunk(offset, block_sz, block_buf);
/*md5(block_buf, block_sz, md5_checksum);
uint_2_str(adler32_checksum(block_buf, block_sz), csum);
chunk_file_hdr->block_nr++;
chunk_bentry.len = block_sz;
chunk_bentry.offset = offset;
memcpy(chunk_bentry.md5, md5_checksum, 16+1);
memcpy(chunk_bentry.csum, csum, 10 + 1);
rwsize = write(fd_chunk, &chunk_bentry, CHUNK_BLOCK_ENTRY_SZ);
if (rwsize == -1 || rwsize != CHUNK_BLOCK_ENTRY_SZ)
return -1;*/
offset += block_sz;
block_sz = 0;
}
}
/* avoid unnecessary computation and comparsion */
if (block_sz == 0) {
block_sz = ((tail - head) > (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) ?
BLOCK_MIN_SZ - BLOCK_WIN_SZ : tail - head;
System.arraycopy(buf, head, block_buf, 0, block_sz);
//memcpy(block_buf, buf + head, block_sz);
head = ((tail - head) > (BLOCK_MIN_SZ - BLOCK_WIN_SZ)) ?
head + (BLOCK_MIN_SZ - BLOCK_WIN_SZ) : tail;
}
adler_pre_char = buf[head - 1];
}
/* read expected data from file to full up buf */
bpos = tail - head;
exp_rwsize = BUF_MAX_SZ - bpos;
adler_pre_char = buf[head - 1];
System.arraycopy(buf, head, buf, 0, bpos);
//memmove(buf, buf + head, bpos);
bb = ByteBuffer.wrap(buf, bpos, exp_rwsize);
}
fin.close();
return fcs;
/*if (rwsize == -1)
return -1;
return 0;*/
}
/**
* CDC算法实现
*
* @param file
* @throws IOException
* @throws NoSuchAlgorithmException
*/
public void process(File file) throws IOException, NoSuchAlgorithmException {
byte[] win_buf = new byte[WINLEN];
long win_tail = 0;
String fingerPrint = null;
byte[] buf = new byte[MAX];
RandomAccessFile randomAccessFile = new RandomAccessFile(file, "r");
long fileLength = randomAccessFile.length();
while (randomAccessFile.read(win_buf) > 0) {
long filePointer = randomAccessFile.getFilePointer();
//如果大于最大值
if (filePointer - win_tail > MAX) {
randomAccessFile.seek(win_tail);
randomAccessFile.read(buf);
fingerPrint = Md5Util.getMD5(buf);
MapUtils.judgeKey(fingerPrints, fingerPrint);
win_tail = filePointer;
continue;
}
//满足rabin指纹
if ((Arrays.hashCode(win_buf) % D == F) && !((filePointer - win_tail) < MIN)) {
offset(randomAccessFile, filePointer, win_tail);
win_tail = filePointer;
} else if (filePointer == fileLength) {
offset(randomAccessFile, filePointer, win_tail);
break;
} else {
randomAccessFile.seek(filePointer - (4 * 1024) + 1);
}
}
}
/**
* 重复代码封装
*
* @param randomAccessFile
* @param filePointer
* @param win_tail
* @throws IOException
* @throws NoSuchAlgorithmException
*/
public void offset(RandomAccessFile randomAccessFile, long filePointer, long win_tail) throws IOException, NoSuchAlgorithmException {
byte[] tmpbytes = new byte[Integer.parseInt(String.valueOf(filePointer - win_tail))];
randomAccessFile.seek(win_tail);
randomAccessFile.read(tmpbytes);
String fingerPrint = Md5Util.getMD5(tmpbytes);
MapUtils.judgeKey(fingerPrints, fingerPrint);
}
public static void main(String[] args) {
try{
CDC cdc = new CDC();
FileChunks fcs = cdc.fileChunk(new File("/home/qh/下载/SPAS data set/2.txt"));
//fcs.printChunks();
}catch(IOException ex){
ex.printStackTrace();
}
}
}