回复 39楼 zaixuexi
谢谢Z版赐教,继续学习你改进的代码。
梅尚程荀
马谭杨奚
#include <stdio.h> #include <winsock.h> #include <string.h> #include <afxinet.h> #include <algorithm> #include <afxdhtml.h> #include <vector> #pragma comment(lib, "ws2_32.lib") CString geturl(char *url) { CString strHtml; WSADATA WSAData={0}; SOCKET sockfd; struct sockaddr_in addr; struct hostent *pURL; char myurl[BUFSIZ]; char *pHost = 0, *pGET = 0; char host[BUFSIZ], GET[BUFSIZ]; char header[BUFSIZ] = ""; static char text[BUFSIZ]; int i; /* * windows下使用socket必须用WSAStartup初始化,否则不能调用 */ if(WSAStartup(MAKEWORD(2,2), &WSAData)) { printf("WSA failed\n"); return strHtml; } /* * 分离url中的主机地址和相对路径 */ strcpy(myurl, url); for (pHost = myurl; *pHost != '/' && *pHost != '\0'; ++pHost); if ( (int)(pHost - myurl) == strlen(myurl) ) strcpy(GET, "/"); else strcpy(GET, pHost); *pHost = '\0'; strcpy(host, myurl); //printf("%s\n%s\n", host, GET); /* * 设定socket参数,并未真正初始化 */ sockfd = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); pURL = gethostbyname(host); addr.sin_family = AF_INET; addr.sin_addr.s_addr = *((unsigned long*)pURL->h_addr); addr.sin_port = htons(80); /* * 组织发送到web服务器的信息 * 为何要发送下面的信息请参考HTTP协议的约定 */ strcat(header, "GET "); strcat(header, GET); strcat(header, " HTTP/1.1\r\n"); strcat(header, "HOST: "); strcat(header, host); strcat(header, "\r\nConnection: Close\r\n\r\n"); /* * 连接到服务器,发送请求header,并接受反馈(即网页源代码) */ connect(sockfd,(SOCKADDR *)&addr,sizeof(addr)); send(sockfd, header, strlen(header), 0); while ( recv(sockfd, text, BUFSIZ, 0) > 0) { //printf("%s", text); strHtml += text; strnset(text, '\0', BUFSIZ); } closesocket(sockfd); WSACleanup(); return strHtml; } // 唯一的应用程序对象 CWinApp theApp; using namespace std; CString GetElementAttr(CComPtr<IHTMLElement> sp, CString strAttr) { VARIANT var; BSTR bsAttr = strAttr.AllocSysString(); HRESULT hr = S_OK; hr = sp->getAttribute(bsAttr, 0, &var); ::SysFreeString(bsAttr); if (hr == S_OK && var.vt != VT_NULL) return CString(var.bstrVal); else return CString(""); } CString GetElementInnerHTML(CComPtr<IHTMLElement> sp) { BSTR bsHtml; sp->get_innerHTML(&bsHtml); return CString(bsHtml); } CString GetElementInnerText(CComPtr<IHTMLElement> sp) { BSTR bsText; sp->get_innerText(&bsText); return CString(bsText); } CString GetElementClassName(CComPtr<IHTMLElement> sp) { BSTR bsClass; sp->get_className(&bsClass); return CString(bsClass); } BOOL GetElementByCollection(CComPtr<IHTMLElementCollection> elementCollection, int nIndex, void** ppElem) { HRESULT hr = S_OK; IDispatch *pDispInputText = NULL; CComVariant vIndex=nIndex; elementCollection->item(vIndex,vIndex,&pDispInputText); hr=pDispInputText->QueryInterface(IID_IHTMLElement,ppElem); return hr == S_OK; } CString GetElementTagName(CComPtr<IHTMLElement> sp) { BSTR bsAttr; sp->get_tagName(&bsAttr); return CString(bsAttr); } CString GetElementId(CComPtr<IHTMLElement> sp) { BSTR bsId; sp->get_id(&bsId); return CString(bsId); } void GetTopDocumentFromUrl(CString url,CComQIPtr<IHTMLDocument2>& pDoc) { CoInitialize(NULL); CString strHtml = geturl(url.GetBuffer()); BSTR bs = strHtml.AllocSysString(); HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,IID_IHTMLDocument2, (void**)&pDoc.p); SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT,0,1); VARIANT* param; hr = SafeArrayAccessData(psa, (LPVOID*)¶m); param->vt = VT_BSTR; param->bstrVal = bs; pDoc->write(psa); ::SysFreeString(bs); } struct ShiCi { CString strName; CString strWriter; CString strContext; }; BOOL GetShiCiFromUrl(CString strUrl, ShiCi & sc) { HRESULT hr = S_OK; CComQIPtr<IHTMLDocument2> pDoc; GetTopDocumentFromUrl(strUrl, pDoc); CComQIPtr<IHTMLElement> spTop; pDoc->get_body(&spTop); CComQIPtr<IHTMLElementCollection> spCol; CComPtr<IDispatch> pDispatch; hr = spTop->get_all(&pDispatch); hr = pDispatch->QueryInterface(IID_IHTMLElementCollection, (VOID**)&spCol); long len; spCol->get_length(&len); int num = 0; for (int c=0; c<len; c++) { CComPtr<IHTMLElement> spP1; if (GetElementByCollection(spCol, c, (void**)&spP1) && GetElementClassName(spP1) == "HeightBorderCenter") { if (num == 0) { sc.strName = GetElementInnerText(spP1); } else if (num == 1) { sc.strWriter = GetElementInnerText(spP1); } else if (num == 2) { sc.strContext = GetElementInnerText(spP1); } num ++; } } if (num < 3) return FALSE; return TRUE; } int _tmain(int argc, TCHAR* argv[], TCHAR* envp[]) { int nRetCode = 0; // 初始化 MFC 并在失败时显示错误 if (!AfxWinInit(::GetModuleHandle(NULL), NULL, ::GetCommandLine(), 0)) { // TODO: 更改错误代码以符合您的需要 _tprintf(_T("错误: MFC 初始化失败\n")); nRetCode = 1; } else { // TODO: 在此处为应用程序的行为编写代码。 } std::vector<ShiCi> arrShiCi; int nStart = 101; int nEnd = 140; for (int i=nStart; i<nEnd; i++) { ShiCi sc; CString strUrl; strUrl.Format("www.", i); if (GetShiCiFromUrl(strUrl, sc)) arrShiCi.push_back(sc); system("cls"); printf("共%d个,已经完成%d个,获取%d个诗词\n", nEnd-nStart, i-nStart+1, arrShiCi.size()); ////////////////////////////////////////////////////////////////////////// // 这么保存保险一点 FILE* fp = fopen("out.xml", "wt"); if (fp == NULL) return 0; fprintf(fp, "<?xml version=\"1.0\" encoding=\"gb2312\"?>\n"); fprintf(fp, "<root>\n"); for (int j=0; j<arrShiCi.size(); j++) { fprintf(fp, "<sc%d>\n", j); fprintf(fp, "<Name>\n"); fprintf(fp, "%s\n", arrShiCi[j].strName); fprintf(fp, "</Name>\n"); fprintf(fp, "<Writer>\n"); fprintf(fp, "%s\n", arrShiCi[j].strWriter); fprintf(fp, "</Writer>\n"); fprintf(fp, "<Context>\n"); fprintf(fp, "%s\n", arrShiCi[j].strContext); fprintf(fp, "</Context>\n"); fprintf(fp, "</sc%d>\n", j); } fprintf(fp, "</root>\n"); fclose(fp); } return nRetCode; }