C版本要迁移到Java版本!
/*
 * Hash one finished chunk with md5(), convert the digest with
 * md5_2_feature() and append the resulting feature to `features`.
 */
static void cdc_push_chunk_feature(vector *features, unsigned char *chunk,
                                   unsigned int chunk_len)
{
    unsigned char md5_checksum[32 + 1] = {0};
    feature_t f;

    md5(chunk, chunk_len, md5_checksum);
    f = md5_2_feature(md5_checksum);
    VEC_PUSH_BACK(features, &f);
}

/*
 * Split the data readable from `fd` into content-defined chunks and append
 * one feature per chunk to `features`.
 *
 * A window of BLOCK_WIN_SIZE bytes slides over the input. A chunk ends when
 * the window hash satisfies (hkey % g_block_size) == CHUNK_CDC_R ("normal"
 * chunk) or when the chunk has grown to BLOCK_MAX_SIZE bytes ("abnormal"
 * chunk). The first BLOCK_MIN_SIZE - BLOCK_WIN_SIZE bytes of every chunk are
 * copied without hashing, so a normal chunk is never shorter than
 * BLOCK_MIN_SIZE. Whatever is left when the input ends is emitted as one
 * final chunk.
 *
 * fd:       descriptor to read from; it is read sequentially until read()
 *           returns 0.
 * features: vector that receives one feature_t per chunk.
 *
 * Returns 0 on success, -1 if read() fails (features pushed before the
 * failure stay in `features`).
 */
int file_chunk_cdc(int fd, vector* features) {
    unsigned char buf[BUF_MAX_SIZE] = {0};
    unsigned char block_buf[BLOCK_MAX_SIZE * 2] = {0};
    unsigned char last_block_buf[BLOCK_MAX_SIZE * 2] = {0};
    char win_buf[BLOCK_WIN_SIZE + 1] = {0};
    /* Byte that most recently left the window; input to the rolling hash. */
    unsigned char adler_pre_char = 0;
    /* Bytes copied into a new chunk before any boundary test is attempted. */
    const unsigned int prefill_sz = BLOCK_MIN_SIZE - BLOCK_WIN_SIZE;
    unsigned int bpos = 0;                  /* unconsumed bytes kept at the front of buf */
    unsigned int rwsize = 0;                /* bytes delivered by the latest read() */
    unsigned int exp_rwsize = BUF_MAX_SIZE; /* free space in buf after bpos */
    unsigned int head, tail;                /* unconsumed region of buf is [head, tail) */
    unsigned int block_sz = 0, old_block_sz = 0;
    unsigned int hkey = 0;
    unsigned int last_block_len;
    ssize_t nread;

    for (;;) {
        nread = read(fd, buf + bpos, exp_rwsize);
        if (nread < 0) {
            /* Never treat the -1 error return as a (huge) byte count. */
            return -1;
        }
        rwsize = (unsigned int)nread;
        if (rwsize == 0) {
            break;
        }

        /* Not enough data left for a full chunk: handle it as the last chunk. */
        if ((rwsize + bpos + block_sz) < BLOCK_MIN_SIZE) {
            break;
        }

        head = 0;
        tail = bpos + rwsize;

        /* Avoid unnecessary computation and comparison: prefill the chunk. */
        if (block_sz < prefill_sz) {
            old_block_sz = block_sz;
            block_sz = ((block_sz + tail - head) > prefill_sz) ?
                       prefill_sz : block_sz + tail - head;
            memcpy(block_buf + old_block_sz, buf + head, block_sz - old_block_sz);
            head += (block_sz - old_block_sz);
        }

        while ((head + BLOCK_WIN_SIZE) <= tail) {
            memcpy(win_buf, buf + head, BLOCK_WIN_SIZE);

            if (g_rolling_hash) {
                /*
                 * Right after the prefill the window is hashed from scratch;
                 * afterwards the hash is rolled forward by one byte.
                 */
                hkey = (block_sz == prefill_sz) ?
                       adler32_checksum(win_buf, BLOCK_WIN_SIZE) :
                       adler32_rolling_checksum(hkey, BLOCK_WIN_SIZE, adler_pre_char,
                                                buf[head + BLOCK_WIN_SIZE - 1]);
            } else {
                hkey = g_cdc_chunk_hashfunc(win_buf);
            }

            if ((hkey % g_block_size) == CHUNK_CDC_R) {
                /* Normal chunk: the boundary window becomes the chunk's tail. */
                memcpy(block_buf + block_sz, buf + head, BLOCK_WIN_SIZE);
                head += BLOCK_WIN_SIZE;
                block_sz += BLOCK_WIN_SIZE;
                if (block_sz >= BLOCK_MIN_SIZE) {
                    cdc_push_chunk_feature(features, block_buf, block_sz);
                    block_sz = 0;
                }
            } else {
                block_buf[block_sz++] = buf[head++];
                /* Abnormal chunk: cut when the size limit is reached. */
                if (block_sz >= BLOCK_MAX_SIZE) {
                    cdc_push_chunk_feature(features, block_buf, block_sz);
                    block_sz = 0;
                }
            }

            /* A chunk was just emitted: prefill the next one. */
            if (block_sz == 0) {
                unsigned int avail = tail - head;
                unsigned int take = (avail > prefill_sz) ? prefill_sz : avail;

                memcpy(block_buf, buf + head, take);
                block_sz = take;
                head += take;
            }

            /* head >= 1 here: every path above advanced it. */
            adler_pre_char = buf[head - 1];
        }

        /* Keep the unconsumed tail at the front of buf and refill behind it. */
        bpos = tail - head;
        exp_rwsize = BUF_MAX_SIZE - bpos;
        if (head > 0) {
            /*
             * Save the byte before the window before memmove overwrites it.
             * When nothing was consumed (head == 0) the previously saved
             * byte is still the right one and buf[head - 1] would be out of
             * bounds.
             */
            adler_pre_char = buf[head - 1];
        }
        memmove(buf, buf + head, bpos);
    }

    /* Last chunk: the partially built block plus whatever is still in buf. */
    last_block_len = rwsize + bpos + block_sz;
    if (last_block_len > 0) {
        memcpy(last_block_buf, block_buf, block_sz);
        memcpy(last_block_buf + block_sz, buf, rwsize + bpos);
        cdc_push_chunk_feature(features, last_block_buf, last_block_len);
    }

    return 0;
}