| 网站首页 | 业界新闻 | 小组 | 威客 | 人才 | 下载频道 | 博客 | 代码贴 | 在线编程 | 编程论坛
欢迎加入我们,一同切磋技术
用户名:   
 
密 码:  
共有 2330 人关注过本帖
标题:怎样将重复的删去!!!!!!!
只看楼主 加入收藏
beyondyf
Rank: 19Rank: 19Rank: 19Rank: 19Rank: 19Rank: 19
等 级:贵宾
威 望:103
帖 子:3282
专家分:12654
注 册:2008-1-21
收藏
得分:0 
初为人父,诸事纷繁。好久没来了,不知道楼主的问题解决得怎么样了。

这里按照包含优先、先占优先的原则制定了去重方案,代码如下。并附上用你给的数据得到的去重结果,看看是否满意,或是否存在漏洞。

使用代码时注意修改输入输出的文件名(在宏里修改)。

程序代码:
#include<limits.h>
#include<stdio.h>
#include<stdlib.h>
#include<malloc.h>
#include<string.h>

/* Paths of the input and output files opened by main(). */
#define INPUT_FILE_PATH        "in.txt"
#define OUTPUT_FILE_PATH    "out.txt"

/* Size in bytes of the buffer get_data() hands to fgets() for one line (0x1000000 = 16 MiB). */
#define LINE_BUFFER_SIZE    0x1000000
/* Used by get_data() for the initial size of the NODE array and in its growth step. */
#define CAPACITY_GROWTH        0x1000000
/* Delimiter set passed to strtok(): space, comma, tab, carriage return, line feed. */
#define SPLIT                " ,\t\r\n"

/*
 * One segment read from the input together with the position it was read from.
 * get_data() fills all four fields; work() may later trim start or set
 * start and end to 0.
 */
typedef struct {
    int start;  /* lower bound; get_data() stores the smaller of the two values here */
    int end;    /* upper bound; get_data() stores the larger of the two values here */
    int line;   /* zero-based number of the input line the pair came from */
    int index;  /* zero-based position of the pair within that line */
} NODE;

/*
 * Reads every "start end" number pair from file_in into a newly allocated
 * NODE array.
 *
 * The file is read line by line with fgets(); each line is split on the SPLIT
 * delimiters and consecutive tokens are consumed two at a time. Tokens are
 * converted with strtol(..., 0), so a "0x" prefix selects hexadecimal and a
 * leading "0" selects octal. Within each pair the values are swapped if
 * necessary so that start <= end. line and index record the zero-based line
 * number and the zero-based position of the pair within that line.
 *
 * Note: fgets() returns at most LINE_BUFFER_SIZE - 1 characters per call, so
 * a longer physical line is consumed in several pieces, each counted as a
 * separate line.
 *
 * dataptr  receives the array on success; the caller must free() it.
 *          It is left untouched on failure.
 * file_in  stream opened for reading.
 *
 * Returns the number of pairs read (possibly 0), or -1 after printing a
 * message when memory runs out or a line holds an odd number of values.
 */
int get_data(NODE ** dataptr, FILE * file_in)
{
    NODE * p, * tp;
    size_t capacity;    /* number of NODE elements p has room for */
    int len;            /* number of NODE elements filled in so far */
    int i, j, t;
    char * str, * sp;

    str = malloc(LINE_BUFFER_SIZE);
    p = malloc(sizeof(NODE) * CAPACITY_GROWTH);
    if(str == NULL || p == NULL)
    {
        puts("Memory low");
        goto fail;
    }
    capacity = CAPACITY_GROWTH;
    len = 0;
    for(i = 0; fgets(str, LINE_BUFFER_SIZE, file_in); i++)
    {
        for(j = 0, sp = strtok(str, SPLIT); sp; j++)
        {
            if((size_t)len == capacity)
            {
                /*
                 * Grow by a fixed number of ELEMENTS and compute the byte
                 * size separately. Refuse to grow when the element count
                 * would no longer fit the int return value or the byte size
                 * would overflow size_t.
                 */
                if(capacity > (size_t)INT_MAX - CAPACITY_GROWTH
                    || capacity + CAPACITY_GROWTH > (size_t)-1 / sizeof(NODE)
                    || (tp = realloc(p, (capacity + CAPACITY_GROWTH) * sizeof(NODE))) == NULL)
                {
                    puts("Memory low");
                    goto fail;
                }
                p = tp;
                capacity += CAPACITY_GROWTH;
            }
            p[len].start = strtol(sp, NULL, 0);
            if((sp = strtok(NULL, SPLIT)) == NULL)
            {
                /* A start value without a matching end value. */
                printf("Data miss at %d line.\n", i + 1);
                printf("index %d\n", j + 1);
                goto fail;
            }
            p[len].end = strtol(sp, NULL, 0);
            if(p[len].start > p[len].end)
            {
                /* Normalise the pair so that start <= end. */
                t = p[len].start;
                p[len].start = p[len].end;
                p[len].end = t;
            }
            p[len].line = i;
            p[len].index = j;
            len++;
            sp = strtok(NULL, SPLIT);
        }
    }
    *dataptr = p;
    free(str);
    return len;

fail:
    /* free(NULL) is a no-op, so this is safe even if an allocation failed. */
    free(p);
    free(str);
    return -1;
}

int cmp_segment(const void * a, const void * b)
{
    int d;
    if(d = ((NODE *)a)->start - ((NODE *)b)->start) return d;
    if(d = ((NODE *)a)->end - ((NODE *)b)->end) return d;
    if(d = ((NODE *)a)->line - ((NODE *)b)->line) return d;
    return ((NODE *)a)->index - ((NODE *)b)->index;
}

int cmp_position(const void * a, const void * b)
{
    int d;
    if(d = ((NODE *)a)->line - ((NODE *)b)->line) return d;
    return ((NODE *)a)->index - ((NODE *)b)->index;
}

/*
 * Removes duplicated coverage from data[0..len-1] in place.
 *
 * The array is sorted with cmp_segment and scanned once. j is the most
 * recent segment still kept, i the segment being examined:
 *   - i ends no later than j ends: i is removed;
 *   - otherwise, i starts no later than j starts: j is removed and i is
 *     re-examined against the kept segment before j;
 *   - otherwise, i starts no later than j ends: i is trimmed to begin at
 *     j.end + 1 and becomes the new kept segment;
 *   - otherwise: i is left unchanged and becomes the new kept segment.
 *
 * A removed segment is marked by setting both start and end to 0. Finally
 * the array is sorted with cmp_position, restoring input order.
 *
 * data  array of len NODEs; modified in place.
 * len   number of elements; nothing is done when it is <= 0.
 */
void work(NODE * data, int len)
{
    int i, j;

    if(data == NULL || len <= 0) return;

    qsort(data, len, sizeof(NODE), cmp_segment);

    for(j = 0, i = 1; i < len; i++)
    for(;;)
    {
        /*
         * Step back over removed segments. Only start == 0 AND end == 0
         * means "removed"; testing start alone would also skip a live
         * segment that begins at coordinate 0.
         */
        while(j >= 0 && data[j].start == 0 && data[j].end == 0) j--;
        if(j < 0)
        {
            /* Nothing kept before i: i becomes the reference segment. */
            j = i;
            break;
        }
        if(data[i].end <= data[j].end)
        {
            /* i ends within the kept segment: remove i. */
            data[i].start = data[i].end = 0;
            break;
        }
        if(data[i].start <= data[j].start)
        {
            /* i reaches past j and starts no later: remove j, retry i. */
            data[j].start = data[j].end = 0;
            continue;
        }
        if(data[i].start <= data[j].end)
        {
            /* Partial overlap: the earlier segment keeps the shared part. */
            data[i].start = data[j].end + 1;
            j = i;
            break;
        }
        /* No overlap with j. */
        j = i;
        break;
    }

    qsort(data, len, sizeof(NODE), cmp_position);
}

/*
 * Reads the segments from INPUT_FILE_PATH, de-duplicates them with work()
 * and writes the result to OUTPUT_FILE_PATH.
 *
 * Output format: one text line per input line, each segment written as
 * "start,end" and segments on the same line separated by commas. No newline
 * is written after the last line. An input without any segment produces an
 * empty output file.
 *
 * Returns 0 on success and EXIT_FAILURE when a file cannot be opened, the
 * input cannot be loaded, or the output cannot be written completely.
 */
int main(void)
{
    FILE * fin, * fout;
    NODE * data;
    int len, i;

    if((fin = fopen(INPUT_FILE_PATH, "r")) == NULL)
    {
        printf("cannot open file.\n");
        return EXIT_FAILURE;
    }

    len = get_data(&data, fin);
    fclose(fin);
    /* get_data has already reported the problem and released its memory. */
    if(len < 0) return EXIT_FAILURE;

    work(data, len);

    if((fout = fopen(OUTPUT_FILE_PATH, "w")) == NULL)
    {
        printf("cannot create file.\n");
        free(data);
        return EXIT_FAILURE;
    }

    /* Starting at 0 means data[0] is only read when a segment exists. */
    for(i = 0; i < len; i++)
    {
        if(i == 0)
        {
            fprintf(fout, "%d,%d", data[i].start, data[i].end);
        }
        else if(data[i].line != data[i - 1].line)
        {
            fprintf(fout, "\n%d,%d", data[i].start, data[i].end);
        }
        else
        {
            fprintf(fout, ",%d,%d", data[i].start, data[i].end);
        }
    }

    free(data);
    /* fclose flushes buffered output, so a failed write can surface here. */
    if(fclose(fout) != 0)
    {
        printf("cannot write file.\n");
        return EXIT_FAILURE;
    }
    return 0;
}
贴结果的前几行出来,完整数据在附件里
66999065,66999928,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,66999929,67208755,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,67208756,67213982
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0
8377885,8384365,0,0,0,0,0,0,8384366,8403806,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8403807,8404227
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
16767166,16770126,0,0,0,0,0,0,0,0,0,0,0,0,16786272,16786573
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,16770127,16786271,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0
25071847,25124232,0,0,0,0,0,0,0,0,25170620,25170815
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,25124233,25170619,0,0,0,0,0,0
33546704,33546991,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,33546992,33585644,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,33585645,33586131
out.rar (103.17 KB)



重剑无锋,大巧不工
2013-04-01 14:08
zhou31146001
Rank: 3Rank: 3
等 级:论坛游侠
帖 子:303
专家分:131
注 册:2012-11-28
收藏
得分:0 
回复 41楼 beyondyf
首先祝贺您喜得贵子,是件该好好庆贺的事,祝阖家欢乐,幸福常驻!
那个问题也算基本解决了,您的编程能力真的很强,如果我能有那么优秀的编程能力就好了!!!!!
感谢您这段时间在这个问题上倾注的精力和付出,谢谢!!!!

2013-04-02 20:38
beyondyf
Rank: 19Rank: 19Rank: 19Rank: 19Rank: 19Rank: 19
等 级:贵宾
威 望:103
帖 子:3282
专家分:12654
注 册:2008-1-21
收藏
得分:0 
客气了,能帮你解决点实际问题我也很开心。正好刚拿到我儿子的基因检测报告,看着觉得挺有意思,其中“相关基因”的名称(或者是个代号)有什么规则么?
图片附件: 游客没有浏览图片的权限,请 登录注册

重剑无锋,大巧不工
2013-04-03 20:24
zhou31146001
Rank: 3Rank: 3
等 级:论坛游侠
帖 子:303
专家分:131
注 册:2012-11-28
收藏
得分:0 
相关基因就是一个基因名称,一般一个基因上要有几万到几十万不等个碱基(A\T\C\G),名字都是命名的。
相关基因后面的CC等实际上相当于一个二模体,那就有4的平方个情况了。

恭喜您喜得贵子。
2013-04-07 14:51
快速回复:怎样将重复的删去!!!!!!!
数据加载中...
 
   



关于我们 | 广告合作 | 编程中国 | 清除Cookies | TOP | 手机版

编程中国 版权所有,并保留所有权利。
Powered by Discuz, Processed in 0.040588 second(s), 8 queries.
Copyright©2004-2024, BCCN.NET, All Rights Reserved