| 网站首页 | 业界新闻 | 小组 | 威客 | 人才 | 下载频道 | 博客 | 代码贴 | 在线编程 | 编程论坛
欢迎加入我们,一同切磋技术
用户名:   
 
密 码:  
共有 12577 人关注过本帖, 1 人收藏
标题:用c写一个爬虫软件
只看楼主 加入收藏
金金金
Rank: 2
等 级:论坛游民
帖 子:16
专家分:19
注 册:2016-8-25
结帖率:50%
收藏(1)
已结贴  问题点数:20 回复次数:11 
用c写一个爬虫软件
求基本思路。。。。。。
搜索更多相关主题的帖子: 软件 
2016-08-30 10:47
金金金
Rank: 2
等 级:论坛游民
帖 子:16
专家分:19
注 册:2016-8-25
收藏
得分:0 
也可以提供代码。。。。。。
2016-08-30 10:47
ehszt
Rank: 12Rank: 12Rank: 12
等 级:贵宾
威 望:40
帖 子:1745
专家分:3216
注 册:2015-12-2
收藏
得分:0 
病毒吗?不要干坏事呀大哥!
2016-08-30 11:13
书生牛犊
Rank: 14Rank: 14Rank: 14Rank: 14
来 自:星夜征程
等 级:贵宾
威 望:10
帖 子:1101
专家分:5265
注 册:2015-10-27
收藏
得分:4 
我没接触过爬虫,但我的印象是这样的:

爬虫不就是把网页里的数据根据规则复制出来,然后整理一下嘛。。。

整理的时候确实可以用到编程语言(但是不同的网页应该是要对应不同的规则吧),爬数据的时候不就F12,Ctrl+C ,Ctrl+V.

φ(゜▽゜*)♪
2016-08-30 13:35
金金金
Rank: 2
等 级:论坛游民
帖 子:16
专家分:19
注 册:2016-8-25
收藏
得分:0 
回复 3楼 ehszt
不是病毒,,只是一个软件
2016-08-30 14:07
金金金
Rank: 2
等 级:论坛游民
帖 子:16
专家分:19
注 册:2016-8-25
收藏
得分:0 
回复 4楼 书生牛犊
对,, 就是需要一个复制网页所有数据,并且是用c写的软件,,,我觉得跟搜索引擎有关。
2016-08-30 14:09
平常心q
Rank: 7Rank: 7Rank: 7
等 级:黑侠
威 望:7
帖 子:120
专家分:550
注 册:2016-3-31
收藏
得分:4 

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <winsock2.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <locale.h>

#define bzero(p, s) memset(p, 0, s)
#define USERAGENT "Wget/1.10.2"
#define ACCEPT "*/*"
#define ACCEPTLANGUAGE "zh-cn,zh;q=0.5"
#define ACCEPTENCODING "gzip,deflate"
#define ACCEPTCHARSET "gb2312,utf-8;q=0.7,*;q=0.7"
#define KEEPALIVE "300"
#define CONNECTION "keep-alive"
#define CONTENTTYPE "application/x-www-form-urlencoded"

#define MAXFILENAME 14
#define DEBUG 1

/* One page known to the crawler; nodes form a tree of sibling lists. */
typedef struct webnode
{
    char * host;                 /* host that serves the page */
    int    port;                 /* TCP port of the web server */
    char *  dir;                 /* directory of the page ("/" for the site root) */
    char * page;                 /* page file name ("@" when the URL names no page) */
    char * file;                 /* name of the local file the page is saved to */
    char IsHandled;              /* non-zero once the page has been processed */
    struct webnode * brother;    /* next node in the same sibling list */
    struct webnode * child;      /* first node of the list of pages linked from this one */
} WEBNODE;

/* Server address for the current request; filled in by ConnectWeb. */
struct sockaddr_in server_addr;
/* Shared scratch state: socket descriptor, send counters, byte count, request
   sequence number, two general-purpose counters (i, j) and the last status (ret). */
int sockfd = 0, dsend = 0, totalsend = 0, nbytes = 0, reqn = 0, i = 0, j = 0, ret = 0;
/* Result of the most recent gethostbyname() call (assigned in HandOneNode). */
struct hostent *host;
/* Outgoing request text, receive buffer and the captured response header. */
char request[409600] = "", buffer[1024] = "", httpheader[1024] = "";
/* Counter used to build the next local file name ("file%05d.html"). */
int FileNumber = 0;
/* Two one-character sentinels, NOT a NUL-terminated string: e[0] = '@' (no page),
   e[1] = '/' (root directory). Always copied with an explicit length of 1. */
char e[2] = "@/";
/* Head of the root list, the most recently appended node, and the node being fetched. */
WEBNODE * NodeHeader, * NodeTail, * NodeCurr;
/* Content of the page currently being analysed (assigned in AnalyzePage). */
char * mapped_mem;

int GetHost(char * , char ** , char ** , int * , char ** ); /* split an http:// URL into host, page, port and directory */
void AnalyzePage(WEBNODE *); /* scan a downloaded page for e-mail addresses and links */
void AddInitNode(char *, char *, int, char * ); /* append a start page to the root list */
void HandleInitNode(WEBNODE *); /* process a sibling list, then the children of its nodes */
void DisplayNode(WEBNODE *); /* print a sibling list and everything below it */
void HandOneNode(WEBNODE *); /* build the request for one node, fetch it and analyse it */
void DoneWithList(int); /* run one request, optionally echoing request and response header */
void DoOnce(); /* connect, send, receive, close */
void ConnectWeb(void); /* open a TCP connection for the current node */
void SendRequest(void); /* write the request text to the socket */
void ReceiveResponse(void); /* read the response; header to memory, body to the node's file */
void GetEmail(char * ); /* append e-mail addresses found in the text to email.txt */
void GetLink(char * ); /* add each href target found in the text as a child node */
void GetBeforePos(char * , char ** ); /* find where the address characters before a position start */
void GetAfterPos(char * , char ** ); /* find where the address characters after a position end */
void AddChildNode(WEBNODE * , char * ); /* add a link target below a node unless already known */
void GetAfterPosWithSlash(char * , char ** ); /* find where a run of URL characters ends */
void GetMemory(char ** , int ); /* (re)allocate a zero-filled buffer of len + 1 bytes */
int IsExistWeb(WEBNODE * , char * , char * , int , char * ); /* test whether a page is already in the tree */
void Rstrchr(char * , int , char ** ); /* locate the last occurrence of a character */
int GetLocalAgent(char * UserAgent, char * Accept, char * AcceptLanguage, char * AcceptEncoding, char * AcceptCharset, char * KeepAlive, char * Connection, char * ContentType); /* fill buffers with the fixed HTTP header values */

/**************************************************************
Purpose: fill the caller's buffers with the fixed HTTP header values
         taken from the USERAGENT ... CONTENTTYPE macros.
Params:  every argument is a writable buffer that must be able to hold
         the corresponding macro string including its terminator.
Returns: always 0.
***************************************************************/
int GetLocalAgent(char * UserAgent, char * Accept, char * AcceptLanguage, char * AcceptEncoding, char * AcceptCharset, char * KeepAlive, char * Connection, char * ContentType)
{
    /* sizeof on a string literal counts its terminating NUL, so each output
       is a complete C string even when the caller did not pre-zero the buffer. */
    memcpy(UserAgent, USERAGENT, sizeof(USERAGENT));
    memcpy(Accept, ACCEPT, sizeof(ACCEPT));
    memcpy(AcceptLanguage, ACCEPTLANGUAGE, sizeof(ACCEPTLANGUAGE));
    memcpy(AcceptEncoding, ACCEPTENCODING, sizeof(ACCEPTENCODING));
    memcpy(AcceptCharset, ACCEPTCHARSET, sizeof(ACCEPTCHARSET));
    memcpy(KeepAlive, KEEPALIVE, sizeof(KEEPALIVE));
    memcpy(Connection, CONNECTION, sizeof(CONNECTION));
    memcpy(ContentType, CONTENTTYPE, sizeof(CONTENTTYPE));
    return 0;
}

/**************************************************************
Purpose: find the last occurrence of character x in string s.
Params:  s - NUL-terminated string to search
         x - character to look for
         d - receives a pointer to the match inside s, or 0 when
             x does not occur (the terminator itself is never matched)
***************************************************************/
void Rstrchr(char * s, int x, char ** d)
{
    char * cursor = s + strlen(s);

    /* Walk backwards from the terminator towards the first character. */
    while(cursor != s)
    {
        cursor--;
        if(*cursor == x)
        {
            *d = cursor;
            return;
        }
    }
    *d = 0;
}

/**************************************************************
功能:连接一个网站服务器
***************************************************************/
void ConnectWeb(void)
{
    /* connect to web server */
      /* create a socket descriptor */
    if((sockfd=socket(PF_INET,SOCK_STREAM,0))==-1)
    {
        fprintf(stderr,"\tSocket Error:%s\a\n",strerror(errno));
        exit(1);
    }

    /* bind address */
    bzero(&server_addr, sizeof(server_addr));
    server_addr.sin_family = AF_INET;
    server_addr.sin_port = htons(NodeCurr->port);
    server_addr.sin_addr = *((struct in_addr *)host->h_addr);

    /* connect to the server */
    if(connect(sockfd, (struct sockaddr *)(&server_addr), sizeof(struct sockaddr)) == -1)
    {
        fprintf(stderr, "\tConnect Error:%s\a\n", strerror(errno));
        exit(1);
    }
}

/**************************************************************
Purpose: send the text held in the global request buffer to sockfd.
Details: write() may accept only part of the data, so the loop keeps
         writing the remainder until everything has been sent; a
         progress line is printed after every successful write.
Errors:  a failed write prints a message and exits with status 1
         (a non-zero status, so callers of the program can tell the
         run failed).
***************************************************************/
void SendRequest(void)
{
    dsend = 0;
    totalsend = 0;
    nbytes = strlen(request);

    while(totalsend < nbytes)
    {
        dsend = write(sockfd, request + totalsend, nbytes - totalsend);
        if(dsend == -1)
        {
            fprintf(stderr, "\tsend error!%s\n", strerror(errno));
            exit(1);
        }
        totalsend += dsend;
        fprintf(stdout, "\n\tRequest.%d %d bytes send OK!\n", reqn, totalsend);
    }
}

/**************************************************************
功能:接收网站的 HTTP 返回
***************************************************************/
void ReceiveResponse(void)
{
    /* get response from web server */
    fd_set writefds;
    struct timeval tival;
    int retry = 0;
    FILE * localfp = NULL;

    i=0; j = 0;
    __ReCeive:
    FD_ZERO(&writefds);
    tival.tv_sec = 10;
    tival.tv_usec = 0;
    if(sockfd > 0) FD_SET(sockfd, &writefds);
    else {fprintf(stderr, "\n\tError, socket is negative!\n"); exit(0);}

    ret = select(sockfd + 1, &writefds, NULL, NULL, &tival);
    if(ret ==0 )
    {
        if(retry++ < 10) goto __ReCeive;
    }
    if(ret <= 0) {fprintf(stderr, "\n\tError while receiving!\n"); exit(0);}

    if(FD_ISSET(sockfd, &writefds))
    {
        memset(buffer, 0, 1024);
        memset(httpheader, 0, 1024);
        if((localfp = fopen(NodeCurr->file, "w")) == NULL) {if(DEBUG) fprintf(stderr, "create file '%s' error\n", NodeCurr->file); return;}
        /* receive data from web server */
        while((nbytes=read(sockfd,buffer,1))==1)
        {
            if(i < 4)
            {
                /* 获取 HTTP 消息头 */
                    if(buffer[0] == '\r' || buffer[0] == '\n')  i++;
                    else i = 0;
                    memcpy(httpheader + j, buffer, 1); j++;
            }
            else
            {
                /* 获取 HTTP 消息体 */
                    fprintf(localfp, "%c", buffer[0]); /* print content on the screen */
                    i++;
            }//else
        }//while
        fclose(localfp);
    }//if(FD_ISSET(sockfd, &writefds))
}

/**************************************************************
Purpose: perform one complete HTTP exchange for the current node:
         connect, send the prepared request, store the response,
         then close the socket.
***************************************************************/
void DoOnce()
{
    ConnectWeb();        /* open the connection to the web server */
    SendRequest();       /* transmit the request text */
    ReceiveResponse();   /* collect header and body */

    /* one request per connection, so the socket is finished with */
    close(sockfd);
}

/**************************************************************
Purpose: run one HTTP exchange.
Params:  flag - when non-zero the request number is advanced and the
                request text is printed before the exchange, and the
                response header is printed after it; when zero the
                exchange runs silently and reqn is left unchanged.
***************************************************************/
void DoneWithList(int flag)
{
    if(!flag)
    {
        DoOnce();
        return;
    }

    fprintf(stdout, "\tRequest.%d is:\n%s", ++reqn, request);
    DoOnce();
    fprintf(stdout, "\n\tThe following is the response header:\n%s", httpheader);
}

/**************************************************************
功能:从字符串 src 中分析出网站地址和端口,并得到文件和目录
***************************************************************/
int GetHost(char * src, char ** web, char ** file, int * port, char ** dir)
{
    char * pA, * pB, * pC;
    int len;

    *port = 0;
    if(!(*src))  return -1;
    pA = src;
    if(!strncmp(pA, "http://", strlen("http://")))  pA = src+strlen("http://");
    else return 1;
    pB = strchr(pA, '/');
    if(pB)
    {
        len = strlen(pA) - strlen(pB);
        GetMemory(web, len);
        memcpy((*web), pA, len);
        if(*(pB+1))
        {
            Rstrchr(pB + 1, '/', &pC);
            if(pC) len = strlen(pB + 1) - strlen(pC);
            else len = 0;
            if(len > 0)
            {
                GetMemory(dir, len);
                memcpy((*dir), pB + 1, len);

                if(pC + 1)
                {
                    len = strlen(pC + 1);
                    GetMemory(file, len);
                    memcpy((*file), pC + 1, len);
                }
                else
                {
                    len = 1;
                    GetMemory(file, len);
                    memcpy((*file), e, len);
                }// else
            }// if(len > 0)
            else
            {
                len = 1;
                GetMemory(dir, len);
                memcpy((*dir), e + 1, len);

                len = strlen(pB + 1);
                GetMemory(file, len);
                memcpy((*file), pB + 1, len);
            }// else
        }// if(*(pB+1))
        else
        {
            len = 1;
            GetMemory(dir, len);
            memcpy((*dir), e + 1, len);

            len = 1;
            GetMemory(file, len);
            memcpy((*file), e, len);
        }// else
    }// if(pB)
    else
    {
        len = strlen(pA);
        GetMemory(web, len);
        memcpy((*web), pA, strlen(pA));
        len = 1;
        GetMemory(dir, len);
        memcpy((*dir), e + 1, len);
        len = 1;
        GetMemory(file, len);
        memcpy((*file), e, len);
    }// else

    pA = strchr((*web), ':');
    if(pA)  *port = atoi(pA + 1);
    else *port = 80;

    return 0;
}

/*********************************************************************
*filename: mailaddrsearch.c
*purpose: 用 C 语言编写一个网络蜘蛛来搜索网上出现的电子邮件地址
*tidied by: zhoulifa(zhoulifa@) 周立发(http://zhoulifa.)
Linux爱好者 Linux知识传播者 SOHO族 开发者 最擅长C语言
*date time:2006-08-31 21:00:00
*Note: 任何人可以任意复制代码并运用这些文档,当然包括你的商业用途
* 但请遵循GPL
*Thanks to: www. 广东省 Linux 公共服务技术支持中心
*********************************************************************/

int main(int argc, char ** argv)
{
    int WebPort;
    char * WebHost = 0, * PageAddress = 0, * WebDir = 0;

    if(argc < 2) {if(DEBUG) fprintf(stdout, "Command error, you should input like this:\n\t%s WebPageAddress1 WebPageAddress2 WebPageAddress3 ...", argv[0]); exit(0);}

    NodeHeader = NodeTail = NodeCurr = 0;

    for(i = 1; i < argc; i++)
    {
        ret = GetHost(argv, &WebHost, &PageAddress, &WebPort, &WebDir); /* Get web page info */
        if(ret) {if(DEBUG) fprintf(stdout, "GetHost error from '%s'\n", argv); exit(0);}
        AddInitNode(WebHost, PageAddress, WebPort, WebDir); /* add this page to chain */
    }
    free(WebHost); free(PageAddress);free(WebDir);
    if(DEBUG)
    {
        fprintf(stdout, "\nDisplay.%5d:", FileNumber);
        DisplayNode(NodeHeader); /* display every node */
    }
    HandleInitNode(NodeHeader); /* handle every page */
    return 0;
}

/**************************************************************
Purpose: analyse the downloaded copy of a page.
Details: the local file node->file is read completely into a heap
         buffer that is given a terminating NUL, then scanned by
         GetEmail and GetLink. While the scan runs the global
         mapped_mem points at that buffer. Whatever happens, the node
         is marked as handled and the local file is removed.
Notes:   plain stdio is used instead of mapping the file, so the text
         is always terminated and no extra system header is needed.
***************************************************************/
void AnalyzePage(WEBNODE * node)
{
    FILE * fp;
    long flength;
    size_t got;
    char * content = NULL;

    fp = fopen(node->file, "rb");
    if(fp == NULL) goto done;

    /* determine the file size */
    if(fseek(fp, 0, SEEK_END) != 0) goto close_file;
    flength = ftell(fp);
    if(flength < 0) goto close_file;
    rewind(fp);

    /* one extra byte for the terminator the string scanners rely on */
    content = malloc((size_t)flength + 1);
    if(content == NULL) goto close_file;
    got = fread(content, 1, (size_t)flength, fp);
    content[got] = '\0';

    mapped_mem = content;
    GetEmail(mapped_mem);
    GetLink(mapped_mem);
    mapped_mem = NULL;
    free(content);

close_file:
    fclose(fp);
done:
    node->IsHandled = 1;
    remove(node->file);
}

/**************************************************************
功能:为根节点设置兄弟节点
***************************************************************/
void AddInitNode(char * Host, char * Page, int Port, char * Dir)
{
    WEBNODE * NewNode;
    char filename[MAXFILENAME + 1] = "";

    if(NodeHeader == NULL) NewNode = NodeHeader = (WEBNODE *)malloc(sizeof(WEBNODE));
    else NodeTail->brother = NewNode = (WEBNODE *)malloc(sizeof(WEBNODE));
    memset(NewNode, 0, sizeof(WEBNODE));
    NewNode->host = (char *)malloc(strlen(Host) + 1);
    memset(NewNode->host, 0, strlen(Host) + 1);
    NewNode->page = (char *)malloc(strlen(Page) + 1);
    memset(NewNode->page, 0, strlen(Page) + 1);
    NewNode->dir = (char *)malloc(strlen(Dir) + 1);
    memset(NewNode->dir, 0, strlen(Dir) + 1);
    NewNode->file = (char *)malloc(MAXFILENAME + 1);
    memset(NewNode->file, 0, MAXFILENAME + 1);
    strcpy(NewNode->host, Host);
    strcpy(NewNode->page, Page);
    strcpy(NewNode->dir, Dir);
    sprintf(filename, "file%05d.html", FileNumber++);
    strcpy(NewNode->file, filename);
    NewNode->port = Port;
    NewNode->IsHandled = 0;
    NewNode->brother = 0;
    NewNode->child = 0;
    NodeTail = NewNode;
}

/**************************************************************
Purpose: crawl a sibling list and then everything below it.
Details: first pass - every node of the list that is not yet handled
         is fetched with HandOneNode (followed by a dump of the whole
         tree when DEBUG is set); second pass - for every node whose
         first child is still unhandled, the child list is crawled
         recursively.
Params:  node - first node of the sibling list; NULL does nothing.
***************************************************************/
void HandleInitNode(WEBNODE * node)
{
    WEBNODE * walker;

    if(node == NULL) return;

    /* pass 1: fetch the pages on this level */
    for(walker = node; walker != NULL; walker = walker->brother)
    {
        if(walker->IsHandled != 0) continue;

        HandOneNode(walker);
        if(DEBUG)
        {
            fprintf(stdout, "\nDisplay.%5d:", FileNumber);
            DisplayNode(NodeHeader); /* display every node */
        }
    }

    /* pass 2: descend into the links collected from each page */
    for(walker = node; walker != NULL; walker = walker->brother)
    {
        if(walker->child != NULL && walker->child->IsHandled == 0)
            HandleInitNode(walker->child);
    }
}

/**************************************************************
Purpose: print the information of all nodes.
Details: writes a blank line, then one line per node of the given
         sibling list (host, port, directory, page, local file name
         and handled flag), then repeats this for the child list of
         every node that has one. A page stored as "@" is shown as
         an empty name.
Params:  NodeHeader - first node of the sibling list to print
***************************************************************/
void DisplayNode(WEBNODE * NodeHeader)
{
    WEBNODE * cur;
    const char * shownPage;

    fprintf(stdout, "\n");

    /* this level */
    for(cur = NodeHeader; cur; cur = cur->brother)
    {
        shownPage = strcmp(cur->page, "@") ? cur->page : "";
        if(strcmp(cur->dir, "/") == 0)
            fprintf(stdout, "\t%s:%d%s%s => %s %d\n", cur->host, cur->port, cur->dir, shownPage, cur->file, cur->IsHandled);
        else
            fprintf(stdout, "\t%s:%d/%s/%s => %s %d\n", cur->host, cur->port, cur->dir, shownPage, cur->file, cur->IsHandled);
    }

    /* the levels below */
    for(cur = NodeHeader; cur; cur = cur->brother)
    {
        if(cur->child)
            DisplayNode(cur->child);
    }
}

/**************************************************************
Purpose: fetch and analyse a single node.
Details: makes the node the current one (NodeCurr), resolves its host
         name, builds an HTTP/1.0 GET request in the global request
         buffer, performs the exchange and analyses the stored page.
Errors:  a host that cannot be resolved ends the program with
         status 1. A request that does not fit into the request
         buffer is not sent; the node is marked as handled and skipped.
***************************************************************/
void HandOneNode(WEBNODE * node)
{
    char UserAgent[1024] = "",
        Accept[1024] = "",
        AcceptLanguage[1024] = "",
        AcceptEncoding[1024] = "",
        AcceptCharset[1024] = "",
        KeepAlive[1024] = "",
        Connection[1024] = "",
        ContentType[1024] = "";
    const char * page;
    int written;

    NodeCurr = node;
    if((host=gethostbyname(NodeCurr->host))==NULL) /* get ip address by domain */
    {
        if(DEBUG)  fprintf(stderr,"\tGethostname '%s' error, %s\n", NodeCurr->host, strerror(errno));
        exit(1);
    }
    GetLocalAgent(UserAgent, Accept, AcceptLanguage, AcceptEncoding, AcceptCharset, KeepAlive, Connection, ContentType); /* Get client browser information */

    /* "@" stands for "no page name" and is sent as an empty string */
    page = strcmp(NodeCurr->page, "@") ? NodeCurr->page : "";

    /* Directory and page come from links found in downloaded pages, so the
       formatting is bounded by the size of the request buffer. */
    if(strcmp(NodeCurr->dir, "/"))
        written = snprintf(request, sizeof(request), "GET /%s/%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\nAccept: %s\r\nConnection: %s\r\n\r\n", NodeCurr->dir, page, NodeCurr->host, UserAgent, Accept, Connection);
    else
        written = snprintf(request, sizeof(request), "GET %s%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\nAccept: %s\r\nConnection: %s\r\n\r\n", NodeCurr->dir, page, NodeCurr->host, UserAgent, Accept, Connection);

    if(written < 0 || (size_t)written >= sizeof(request))
    {
        /* a truncated request would be malformed, so do not send it */
        if(DEBUG)  fprintf(stderr, "\tRequest for host '%s' is too long, page skipped\n", NodeCurr->host);
        node->IsHandled = 1;
        return;
    }

    DoneWithList(1);
    AnalyzePage(NodeCurr);
}

/**************************************************************
功能:从字符串 src 中分析出邮件地址保存到文件
***************************************************************/
void GetEmail(char * src)
{
    char * pa, * pb, * pc, *pd;
    char myemail[1024] = "";
    FILE * mailfp = NULL;
    if((mailfp = fopen("email.txt", "a+")) == NULL)
    return;
    pa = src;
    while((pb = strchr(pa, '@')))
    {
        GetBeforePos(pb, &pc);
        GetAfterPos(pb, &pd);
        if(pc && pd && (strlen(pc) > (strlen(pd) + 3)))
        {
            memset(myemail, 0, 1024);
            memcpy(myemail, pc, strlen(pc) - strlen(pd));
            if(strcmp(NodeCurr->dir, "/")) fprintf(mailfp, "%s\thttp://%s/%s/%s\n", myemail, NodeCurr->host, NodeCurr->dir, strcmp(NodeCurr->page, "@")?NodeCurr->page:"");
            else  fprintf(mailfp, "%s\thttp://%s%s%s\n", myemail, NodeCurr->host, NodeCurr->dir, strcmp(NodeCurr->page, "@")?NodeCurr->page:"");
            if(*(pd + 1))
                pa = pd + 1;
            else break;
        }
        else if(*(pb + 1))
            pa = pb + 1;
        else
            break;
    }// while
    fclose(mailfp);
}

/**************************************************************
Purpose: find where the e-mail characters in front of src begin,
         i.e. the part of an address before the '@'.
Params:  src - position inside a string (normally the '@')
         d   - receives the first character of the run of letters,
               digits, '.', '-' and '_' that ends directly before
               src; equals src when the preceding character is not
               such a character
NOTE(review): the backward scan has no lower bound. The caller must
         guarantee that some other character precedes the run inside
         the same buffer.
***************************************************************/
void GetBeforePos(char * src, char ** d)
{
    char * scan = src - 1;

    for(;;)
    {
        char c = *scan;
        int keep = (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9')
                || c == '.' || c == '-' || c == '_';

        if(!keep) break;
        scan--;
    }

    /* scan rests on the first rejected character; the run starts after it */
    *d = scan + 1;
}

/**************************************************************
Purpose: find where the e-mail characters behind src end, i.e. the
         end of the part of an address after the '@'.
Params:  src - position inside a NUL-terminated string (normally the '@')
         d   - receives the first character after src + 1 that is not
               a letter, digit, '.', '-' or '_'; this may be the
               terminator
***************************************************************/
void GetAfterPos(char * src, char ** d)
{
    char * scan = src + 1;

    while((*scan >= 'a' && *scan <= 'z')
       || (*scan >= 'A' && *scan <= 'Z')
       || (*scan >= '0' && *scan <= '9')
       || *scan == '.' || *scan == '-' || *scan == '_')
    {
        scan++;
    }

    *d = scan;
}

/**************************************************************
Purpose: find the end of a run of URL characters, used for href
         values that are not quoted.
Params:  src - start of the run inside a NUL-terminated string
         d   - receives the first character that is not a letter, a
               digit or one of . - _ = : / ? & (this may be the
               terminator); receives 0 when src is NULL
***************************************************************/
void GetAfterPosWithSlash(char * src, char ** d)
{
    char * scan;

    if(!src)
    {
        *d = 0;
        return;
    }

    for(scan = src; ; scan++)
    {
        char c = *scan;

        if((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9'))
            continue;

        switch(c)
        {
            case '.': case '-': case '_': case '=':
            case ':': case '/': case '?': case '&':
                continue;       /* accepted punctuation: keep scanning */
            default:
                break;          /* leaves the switch; the loop ends below */
        }
        break;
    }

    *d = scan;
}

/**************************************************************
Purpose: give *myanchor a zero-filled buffer of len + 1 bytes.
Params:  myanchor - address of the pointer to (re)allocate; a NULL
                    pointer gets a new block, any other pointer is
                    resized (realloc treats NULL like malloc)
         len      - number of usable characters; a negative value is
                    treated as 0
Details: the whole buffer is cleared, so earlier content is lost and
         the result is always an empty, terminated string.
Errors:  when no memory is available a message is printed and the
         program exits with status 1 instead of writing through NULL.
***************************************************************/
void GetMemory(char ** myanchor, int len)
{
    size_t size;
    char * block;

    if(len < 0) len = 0;
    size = (size_t)len + 1;

    /* keep the old pointer until the new block is known to be valid */
    block = realloc(*myanchor, size);
    if(block == NULL)
    {
        fprintf(stderr, "\tOut of memory (%lu bytes requested)\n", (unsigned long)size);
        exit(1);
    }

    memset(block, 0, size);
    *myanchor = block;
}

/**************************************************************
功能:从 src 中分析出网页链接,并加入到当前节点的子节点上
***************************************************************/
void GetLink(char * src)
{
    char * pa, * pb, * pc;
    char * myanchor = 0;
    int len = 0;

    pa = src;
    do
    {
        if((pb = strstr(pa, "href='")))
        {
            pc = strchr(pb + 6, '\'');
            len = strlen(pb + 6) - strlen(pc);
            GetMemory(&myanchor, len);
            memcpy(myanchor, pb + 6, len);
        }
        else if((pb = strstr(pa, "href=\"")))
        {
            pc = strchr(pb + 6, '"');
            len = strlen(pb + 6) - strlen(pc);
            GetMemory(&myanchor, len);
            memcpy(myanchor, pb + 6, len);
        }
        else if((pb = strstr(pa, "href=")))
        {
            GetAfterPosWithSlash(pb + 5, &pc);
            len = strlen(pb + 5) - strlen(pc);
            GetMemory(&myanchor, len);
            memcpy(myanchor, pb + 5, len);
        }
        else {goto __returnLink ;}

        if(strlen(myanchor) > 0)
            AddChildNode(NodeCurr, myanchor);
        if(pc + 1)
            pa = pc + 1;
    }while(pa);
__returnLink:
    return;
}

/**************************************************************
Purpose: add the page a link points to as a child of node, unless the
         page is already somewhere in the tree.
Params:  node - page the link was found on
         src  - link text (value of an href attribute); NULL does nothing
Details: links to mail addresses, style sheets, XML, icons, JPG/GIF
         images, javascript and anything containing '+' are ignored.
         An absolute "http://" link is split by GetHost. Any other
         link is resolved against node: a leading '/' means "from the
         site root", everything else is relative to node's directory.
         A link without page name gets the page "@", a link without
         directory gets the directory "/". The new node receives the
         next local file name and becomes NodeTail.
Errors:  running out of memory prints a message and exits with status 1.
***************************************************************/

/* Store the len characters starting at from into *dst as a fresh string. */
static void ChildNodeStore(char ** dst, const char * from, int len)
{
    GetMemory(dst, len);
    memcpy(*dst, from, len);
}

/* Return a heap copy of text; terminates the program when memory runs out. */
static char * ChildNodeCopy(const char * text)
{
    size_t size = strlen(text) + 1;
    char * copy = malloc(size);

    if(copy == NULL)
    {
        fprintf(stderr, "\tOut of memory while adding a linked page\n");
        exit(1);
    }
    memcpy(copy, text, size);
    return copy;
}

void AddChildNode(WEBNODE * node, char * src)
{
    int WebPort, parentLen, subLen;
    char * WebHost = 0, * PageAddress = 0, * WebDir = 0, * pC = 0;
    WEBNODE * NewNode, * last;
    char IsFromRoot = 0;

    if(!src)        return;
    if(!strncasecmp(src, "mailto:", strlen("mailto:")))        return ;
    if(strstr(src, ".css"))        return;
    if(strstr(src, ".xml"))        return;
    if(strstr(src, ".ico"))        return;
    if(strstr(src, ".jpg"))        return;
    if(strstr(src, ".gif"))        return;
    if(strstr(src, "javascript:"))        return;
    if(strstr(src, "+"))        return;

    ret = GetHost(src, &WebHost, &PageAddress, &WebPort, &WebDir);
    if(ret)
    {
        /* not an absolute http:// link: same host and port as the parent page */
        ChildNodeStore(&WebHost, node->host, (int)strlen(node->host));
        WebPort = node->port;

        IsFromRoot = (src[0] == '/');
        Rstrchr(IsFromRoot ? src + 1 : src, '/', &pC);

        if(pC)
        {
            if(IsFromRoot)
            {
                /* "/dir/page": the directory is what lies between the first and last slash */
                ChildNodeStore(&WebDir, src + 1, (int)(pC - (src + 1)));
            }
            else
            {
                /* "sub/page": append the sub directory to the parent's directory.
                   A parent in the root ("/") contributes nothing, which avoids
                   a directory that starts with "//". */
                parentLen = strcmp(node->dir, "/") ? (int)strlen(node->dir) : 0;
                subLen = (int)(pC - src);
                if(parentLen > 0)
                {
                    GetMemory(&WebDir, parentLen + 1 + subLen);
                    memcpy(WebDir, node->dir, parentLen);
                    WebDir[parentLen] = '/';
                    memcpy(WebDir + parentLen + 1, src, subLen);
                }
                else
                    ChildNodeStore(&WebDir, src, subLen);
            }

            /* Test the character after the last slash, not the pointer value:
               a link ending in '/' names no page. */
            if(*(pC + 1))
                ChildNodeStore(&PageAddress, pC + 1, (int)strlen(pC + 1));
            else
                ChildNodeStore(&PageAddress, e, 1);
        }
        else if(IsFromRoot)
        {
            /* "/page": page in the site root */
            ChildNodeStore(&WebDir, e + 1, 1);
            ChildNodeStore(&PageAddress, src + 1, (int)strlen(src + 1));
        }
        else
        {
            /* "page": page in the parent's directory */
            ChildNodeStore(&WebDir, node->dir, (int)strlen(node->dir));
            ChildNodeStore(&PageAddress, src, (int)strlen(src));
        }
    }

    /* use the same "nothing given" markers everywhere so duplicates are recognised */
    if(PageAddress[0] == '\0') ChildNodeStore(&PageAddress, e, 1);
    if(WebDir[0] == '\0') ChildNodeStore(&WebDir, e + 1, 1);

    ret = IsExistWeb(NodeHeader, WebHost, PageAddress, WebPort, WebDir);
    if(ret) goto cleanup;

    NewNode = calloc(1, sizeof *NewNode);
    if(NewNode == NULL)
    {
        fprintf(stderr, "\tOut of memory while adding a linked page\n");
        exit(1);
    }
    NewNode->host = ChildNodeCopy(WebHost);
    NewNode->page = ChildNodeCopy(PageAddress);
    NewNode->dir = ChildNodeCopy(WebDir);
    NewNode->file = calloc(1, MAXFILENAME + 1);
    if(NewNode->file == NULL)
    {
        fprintf(stderr, "\tOut of memory while adding a linked page\n");
        exit(1);
    }
    /* Bounded formatting: a counter wider than five digits shortens the
       name instead of writing past the buffer. */
    snprintf(NewNode->file, MAXFILENAME + 1, "file%05d.html", FileNumber++);
    NewNode->port = WebPort;
    NewNode->IsHandled = 0;
    NewNode->brother = 0;
    NewNode->child = 0;

    /* Append to the end of this node's own child list; walking the list does
       not depend on which node was added last elsewhere. */
    if(node->child == NULL)
        node->child = NewNode;
    else
    {
        for(last = node->child; last->brother != NULL; last = last->brother)
            ;
        last->brother = NewNode;
    }
    NodeTail = NewNode;

cleanup:
    free(WebHost);
    free(PageAddress);
    free(WebDir);
}

/**************************************************************
Purpose: check whether a page is already known.
Params:  node - first node of the sibling list to search
         host, page, port, dir - identity of the page looked for
Returns: 1 when a node of this sibling list matches, 2 when a match
         exists somewhere below one of its nodes, 0 otherwise.
Notes:   the result of each recursive search is stored in the global
         ret.
***************************************************************/
int IsExistWeb(WEBNODE * node, char * host, char * page, int port, char * dir)
{
    WEBNODE * cur;

    /* this level first */
    for(cur = node; cur != NULL; cur = cur->brother)
    {
        if(cur->port != port) continue;
        if(strcmp(cur->host, host) != 0) continue;
        if(strcmp(cur->page, page) != 0) continue;
        if(strcmp(cur->dir, dir) != 0) continue;
        return 1;
    }

    /* then every child list */
    for(cur = node; cur != NULL; cur = cur->brother)
    {
        if(cur->child == NULL) continue;

        ret = IsExistWeb(cur->child, host, page, port, dir);
        if(ret)
            return 2;
    }
    return 0;
}
2016-09-05 22:17
诸葛欧阳
Rank: 19Rank: 19Rank: 19Rank: 19Rank: 19Rank: 19
来 自:流年
等 级:贵宾
威 望:82
帖 子:2790
专家分:14619
注 册:2014-10-16
收藏
得分:4 
真长

一片落叶掉进了回忆的流年。
2016-09-06 09:06
luhaocheng
Rank: 1
等 级:新手上路
帖 子:1
专家分:4
注 册:2016-9-6
收藏
得分:4 
学习了,好厉害
2016-09-06 23:14
海贼王杰克
Rank: 1
等 级:新手上路
帖 子:2
专家分:8
注 册:2016-9-7
收藏
得分:0 
这个代码有错误呀
2016-09-07 15:38
快速回复:用c写一个爬虫软件
数据加载中...
 
   



关于我们 | 广告合作 | 编程中国 | 清除Cookies | TOP | 手机版

编程中国 版权所有,并保留所有权利。
Powered by Discuz, Processed in 0.023814 second(s), 10 queries.
Copyright©2004-2024, BCCN.NET, All Rights Reserved