如何请求某个网站下的html页面数据？

用C++编写一个程序，用于请求某个网站某个分类下的一个html页面数据
程序可以取到形如 "www.sina.com"、"hall.91mq.com" 这类网址（站点根路径）的数据，但是无法请求到 "www.sina.edu\cet4\cet4.html"、"hall.91mq.com\mg.defp1.html" 这类带路径的页面数据，请大家帮我看看，谢谢！
#include <string>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <time.h>

#include "winsock2.h"

#pragma comment(lib, "ws2_32.lib")

#define DEFAULT_PAGE_BUF_SIZE 1048576

using namespace std;
void main()
{
WSADATA wsaData;
int err;
err = WSAStartup(MAKEWORD(2,2), &wsaData);
if( err != 0 )
{
return;
} // timer is start
clock_t start, finish;
double duration;
start = clock(); char host[] = "www.sina.com.cn";
char *request = "GET / HTTP/1.0\r\nHost: www.sina.com.cn\r\nConnection: Close\r\n\r\n"; struct hostent *hp;
hp = gethostbyname(host);
if(hp == NULL)
{
cout << "gethostbyname() error in GetIpByHost: " << host << endl;
return;
} // 获取域名对应的IP
struct in_addr inAddr;
LPSTR lpAddr;
lpAddr = hp->h_addr;
memmove(&inAddr,lpAddr,4); int sock, ret = 0, optval = 1;
struct sockaddr_in sa;
sa.sin_family = AF_INET;
sa.sin_port = htons(80);
sa.sin_addr.s_addr = inet_addr(inet_ntoa(inAddr)); sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
connect(sock, (SOCKADDR*)&sa, sizeof(sa));
if(sock == -1)
{
return;
}
if(sock == -2)
{
return;
} // send the "GET" data
ret = send(sock, request, strlen(request), 0); // 网页内容长度。可以从http头部数据中获取 "Content-Length:"
int m_nContentLength = DEFAULT_PAGE_BUF_SIZE; char *pageBuf;
pageBuf = (char *)malloc(m_nContentLength);
memset(pageBuf, 0, m_nContentLength); int bytesRead = 0;
while(ret > 0)
{
ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0); if(ret > 0)
{
bytesRead += ret;
}
}
pageBuf[bytesRead] = '\0'; cout << bytesRead << endl; // write the html content to the file
ofstream ofs;
ofs.open("ofs.txt");
ofs << pageBuf << endl; ofs.close();
free(pageBuf);
closesocket(sock);
WSACleanup(); // timer is finish
finish = clock();
duration = (double)(finish - start) / CLOCKS_PER_SEC;
cout << "have cost " << duration << " seconds\n"; return;
}

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

浏览器可以打开的.
勘误：hall.91mq.com\mg.defp1.html 改为 hall.91mq.com\mg\defp1.htm
www.sina.edu\cet4\cet4.html 无此网页，此处只是为了说明情况.
// Fetches the resource at `URL` through `m_session` and returns its text.
//
// Lines are read with ReadString() and appended to the result one after
// another, exactly as before (no separator is inserted between them).
// On CInternetException the error text is stored in `m_Error`, `IsSuc` is set
// to FALSE, and whatever was read before the failure is returned.
// NOTE(review): `m_session`, `m_Error` and `IsSuc` are declared outside this
// snippet; presumably members of the enclosing class - confirm.
CString IdentifyValidate(CString URL)
{
    // Points to the opened URL file object; owned by this function.
    CInternetFile* pUrlFile = NULL;
    CString szContent = _T("");
    CString szLine = _T("");

    try
    {
        // Connect to the HTTP server and open the URL.
        pUrlFile = (CInternetFile*)m_session.OpenURL(URL);
        if (pUrlFile != NULL)
        {
            while (pUrlFile->ReadString(szLine))
            {
                szContent += szLine;
                szLine.Empty();
            }

            pUrlFile->Close();
            delete pUrlFile;
            pUrlFile = NULL;
        }
    }
    catch (CInternetException* e)
    {
        // TCHAR buffer so the block builds in both ANSI and UNICODE
        // configurations, consistent with the _T() literals above.
        TCHAR Err[200] = { 0 };
        e->GetErrorMessage(Err, 200);
        m_Error = Err;
        e->Delete();
        IsSuc = FALSE;

        // The object returned by OpenURL must be deleted by the caller even
        // when reading failed; previously it leaked on this path.
        // NOTE(review): relies on the file destructor releasing the handle.
        delete pUrlFile;
        pUrlFile = NULL;
    }
    return szContent;
}
不用啊，hall.91...是我们公司的网站，没有什么安全机制，我觉得是程序哪里处理错了...
这方法真原始，竟然用 socket。把我项目里用的代码贴出来，保证无 bug，比网页上找的强多了。
//MFC方式，这段代码只适合GB2312编码的网页，编译环境为UNICODE
// Downloads the page at `strUrl` with MFC's CInternetSession and returns it
// as a CString (the raw bytes are converted by CString's char* constructor).
//
// Returns an empty string when the URL cannot be opened. At most 1 MiB is
// kept; anything beyond that is discarded instead of overrunning the buffer.
// A CInternetException thrown while reading still propagates to the caller,
// but the buffer, file object and session are released first.
CString GetHtmlContent(CString strUrl)
{
    const UINT kMaxBytes = 1024 * 1024;
    const UINT kChunkBytes = 4096;

    CInternetSession session;
    CStdioFile* file = NULL;
    strUrl.TrimLeft();
    strUrl.TrimRight();
    try
    {
        file = session.OpenURL(strUrl);
    }
    catch (CInternetException* pException)
    {
        pException->Delete();
        session.Close();
        AfxMessageBox(L"CInternetException");
        return L"";
    }

    if (file == NULL)
    {
        session.Close();
        return L"";
    }

    // One extra byte guarantees a terminating NUL even when the cap is hit.
    char* pData = new char[kMaxBytes + 1];
    ZeroMemory(pData, kMaxBytes + 1);
    UINT readTotal = 0;
    try
    {
        while (readTotal < kMaxBytes)
        {
            // Never ask for more than the space that is left in the buffer.
            const UINT room = kMaxBytes - readTotal;
            const UINT want = (room < kChunkBytes) ? room : kChunkBytes;
            const UINT nRead = file->Read(pData + readTotal, want);
            if (nRead == 0)
            {
                break; // end of data
            }
            readTotal += nRead;
        }
    }
    catch (CInternetException*)
    {
        // Release everything, then let the caller see the same exception.
        // NOTE(review): relies on the file destructor releasing the handle.
        delete file;
        session.Close();
        delete[] pData;
        throw;
    }

    // Close the file before the session that created it, and delete the
    // object: OpenURL hands ownership to the caller.
    file->Close();
    delete file;
    session.Close();

    CString strHtml(pData);
    delete[] pData; // allocated with new[], so delete[] is required
    return strHtml;
}
// Win32 API approach; built as ANSI; supports UTF-8 and GB2312 sites.
#include "stdafx.h"
#include "windows.h"
#include "Wininet.h"
#pragma comment(lib, "Wininet.lib")
#include <string>
using namespace std;enum Html_Type
{
CODE_UTF8,   //UTF8编码的网站
CODE_GB2312, //GB2312编码的网站
};string DownHtmlContent(const char* strUrl, Html_Type type )
{
string strContent;
HINTERNET hSession = InternetOpen("RookIE/1.0", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
if (hSession != NULL)
{
HINTERNET handle2 = InternetOpenUrl(hSession, strUrl, NULL, 0, INTERNET_FLAG_DONT_CACHE, 0);
if (handle2 != NULL)
{
char *Temp = new char[1024*1024];
ZeroMemory(Temp, 1024*1024);
DWORD Number = 1;
DWORD Total = 0;
BOOL bRes = FALSE;
while ( Number > 0 )
{
if( InternetReadFile(handle2, Temp+Total, 2048, &Number) )
{
Total += Number;
}
}
if ( type == CODE_GB2312 )
{
strContent = Temp;
}
else if ( type == CODE_UTF8 )
{
wchar_t *pUnicode = new wchar_t[1024*1024];
char *pAnsi = new char[1024*1024];
ZeroMemory( pAnsi, 1024*1024 );
ZeroMemory( pUnicode, 1024*1024*2 );
if( MultiByteToWideChar( CP_UTF8, 0, Temp, Total, pUnicode, 1024*1024 ) != 0 )
{
WideCharToMultiByte( CP_ACP, 0, pUnicode, wcslen( pUnicode ), pAnsi, 1024*1024, "", NULL );
strContent = pAnsi;
}
delete []pAnsi;
delete []pUnicode;
}
delete []Temp;
InternetCloseHandle(handle2);
handle2 = NULL;
}
InternetCloseHandle(hSession);
hSession = NULL;
} return strContent;
}
int _tmain(int argc, _TCHAR* argv[])
{
string strUTF8Test = DownHtmlContent( "http://www.google.cn/webhp?source=g_cn", CODE_UTF8 );
string strGBTest = DownHtmlContent( "http://www.baidu.com", CODE_GB2312 );
return 0;
}
获取html网页数据
还是用wininet吧
楼上都给出代码了，不多说了