Linux獲取網頁源碼的幾種方法 - 遺世之都 - ITeye技術網站
JavaEye博客還是本科做畢業設計時候開通的,基本上荒廢了,現在決定記錄下平時編程遇到的問題或者解決方案。
第一個方法是利用Linux下的工具來獲取網頁源碼。我用的是wget,也可以使用curl;curl更加靈活,可以設置很多參數。
#include <cstdio>   // std::remove
#include <cstdlib>  // std::system
#include <fstream>  // std::ifstream
#include <string>   // std::string, std::getline

// Wrap `arg` in single quotes so /bin/sh passes it through verbatim.
// An embedded single quote is written as '\'' (close quote, escaped quote,
// reopen quote), which is the only character needing special treatment.
static std::string ShellQuote(const std::string &arg)
{
    std::string quoted = "'";
    for (std::string::size_type i = 0; i < arg.size(); ++i)
    {
        if (arg[i] == '\'')
        {
            quoted += "'\\''";
        }
        else
        {
            quoted += arg[i];
        }
    }
    quoted += "'";
    return quoted;
}

// Fetch a web page by running the external `wget` tool.
//
// The page is downloaded into a temporary file in the current working
// directory, named after the part of `url` that follows the last '/'
// (the whole URL when it contains no '/'). The file is read back into
// memory and then deleted. An existing file with that name is overwritten
// and removed.
//
// Parameters:
//   url - address of the page to download.
//
// Returns the file's lines concatenated WITHOUT their line terminators
// (kept from the original implementation), or "" when no file name can be
// derived from the URL (e.g. it ends in '/'), when wget reports failure, or
// when the downloaded file cannot be opened.
std::string GetHtmlByWget(std::string url)
{
    // Derive the download file name from the URL.
    const std::string::size_type slash = url.find_last_of("/");
    const std::string fileName =
        (slash == std::string::npos) ? url : url.substr(slash + 1);
    if (fileName.empty())
    {
        return "";
    }

    // "./" keeps a name such as "-" from being read by wget as "stdout".
    const std::string localPath = "./" + fileName;

    // -q suppresses progress output. -O forces the output file, so wget can
    // never pick a different name (e.g. "name.1" when "name" already exists).
    // Both arguments are shell-quoted because the URL is caller-controlled;
    // "--" stops a URL that starts with '-' from being parsed as an option.
    std::string strCom = "wget -q -O ";
    strCom.append(ShellQuote(localPath));
    strCom.append(" -- ");
    strCom.append(ShellQuote(url));
    if (std::system(strCom.c_str()) != 0)
    {
        // wget -O creates the file even when the download fails.
        std::remove(localPath.c_str());
        return "";
    }

    std::ifstream fin(localPath.c_str());
    if (!fin)
    {
        return "";
    }

    // std::getline on a std::string has no line-length limit, unlike a
    // fixed char[1024] buffer, which stops reading at the first long line.
    std::string strHtml;
    std::string line;
    while (std::getline(fin, line))
    {
        strHtml.append(line);
    }
    fin.close();

    // Delete the temporary download without spawning a shell.
    std::remove(localPath.c_str());
    return strHtml;
}
第二個方法是用socket來獲取源碼
- //通過GET獲取網頁源碼 ??
- string?GetHtmlByGet(string?url)??
- {??
- ????string?strHtmlContent?=? "" ;??
- ???? int ?sockfd;??
- ???? struct ?sockaddr_in?addr;??
- ???? struct ?hostent?*pURL;??
- ???? char ?text[RECVBUF];??
- ??
- ???? //分析鏈接 ??
- ????UrlInfo?urlInfo?=?ParseURL(url);??
- ????string?sAccept?=? "Accept:?*/*\r\nAccept-Language:?zh-cn\r\nAccept-Encoding:?gzip,?deflate" ;??
- ???? //不同的主機UserAgent不同 ??
- ????string?sUserAgent?=? "Mozilla/5.0?(X11;?U;?Linux?i686;?en-US)?AppleWebKit/534.10?(KHTML,?like?Gecko)?Chrome/8.0.552.224?Safari/534.10" ;??
- ???? //將端口轉換為字符串 ??
- ???? char ?t[6];??
- ????string??strPort;??
- ????sprintf(t, "%d" ,?urlInfo.Port);??
- ????strPort?=?t;??
- ???? //構造發送字符串 ??
- ????string?strRequest?=? "" ;??
- ????strRequest.append( "GET?" );??
- ????strRequest.append(urlInfo.File);??
- ????strRequest.append( "?" );??
- ????strRequest.append(urlInfo.Body);??
- ????strRequest.append( "?HTTP/1.1\r\n" );??
- ????strRequest.append(sAccept);??
- ????strRequest.append( "\r\nUser-Agent:" );??
- ????strRequest.append(sUserAgent);??
- ????strRequest.append( "\r\nHost:" );??
- ????strRequest.append(urlInfo.Host);??
- ????strRequest.append( ":" );??
- ????strRequest.append(strPort);??
- ????strRequest.append( "\r\nConnection:?Keep-Alive\r\n\r\n" );??
- ??
- ???? char *?host?=? const_cast < char *>(urlInfo.Host.c_str());??
- ????sockfd?=?socket(AF_INET,?SOCK_STREAM,?IPPROTO_TCP);? //TCP方式發送 ??
- ????pURL?=?gethostbyname(host);??
- ????addr.sin_family?=?AF_INET;??
- ????addr.sin_addr.s_addr?=?*((unsigned? long *)pURL->h_addr);??
- ????addr.sin_port?=?htons(80);??
- ??
- ???? //連接 ??
- ????connect(sockfd,( struct ?sockaddr?*)&addr, sizeof (addr));??
- ???? //發送 ??
- ????send(sockfd,? const_cast < char *>(strRequest.c_str()),?strRequest.length(),?0);??
- ???? //接受 ??
- ???? while (recv(sockfd,?text,?RECVBUF,?0)?>?0)??
- ????{??
- ????????strHtmlContent.append(text);??
- ????????bzero(text,RECVBUF);??
- ????}??
- ???? //關閉socket ??
- ????close(sockfd);??
- ???? //返回接受結果 ??
- ???? return ?strHtmlContent;??
- }??
第三個方法是使用libcurl
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <curl/curl.h>
- ??
#define MAX_BUF 65536

/* Destination for the downloaded data. The extra byte leaves room for the
 * terminating NUL that write_data() stores after every chunk. */
char wr_buf[MAX_BUF + 1];
/* Number of payload bytes currently held in wr_buf. */
int wr_index;

/*
 * Write-data callback (called within the context of curl_easy_perform).
 *
 * Appends the size * nmemb bytes at `buffer` to wr_buf, advances wr_index
 * and keeps wr_buf NUL-terminated.
 *
 * buffer - chunk of received data (not NUL-terminated).
 * size   - size of one element.
 * nmemb  - number of elements.
 * userp  - optional pointer to an int error flag; set to 1 when the chunk
 *          is rejected. May be NULL.
 *
 * Returns the number of bytes stored. When the chunk would not fit into the
 * remaining space (or wr_index is out of range) nothing is stored, the flag
 * is set and 0 is returned to signal the problem to curl.
 */
size_t write_data(void *buffer, size_t size, size_t nmemb, void *userp)
{
  size_t used;
  size_t room;
  size_t segsize;

  /* Guard against a corrupted index before using it as an offset. */
  if (wr_index < 0 || wr_index > MAX_BUF) {
    if (userp != NULL) {
      *(int *)userp = 1;
    }
    return 0;
  }
  used = (size_t)wr_index;
  room = (size_t)MAX_BUF - used;

  /* Check whether this data exceeds the space left in our buffer. Dividing
   * instead of multiplying means size * nmemb cannot overflow before the
   * comparison. */
  if (size != 0 && nmemb > room / size) {
    if (userp != NULL) {
      *(int *)userp = 1;
    }
    return 0;
  }
  segsize = size * nmemb;

  /* Copy the data from the curl buffer into our buffer. */
  if (segsize > 0) {
    memcpy(&wr_buf[used], buffer, segsize);
  }

  /* Update the write index; the sum is at most MAX_BUF, so it fits an int. */
  wr_index = (int)(used + segsize);

  /* NUL-terminate the buffer. */
  wr_buf[wr_index] = 0;

  /* Tell curl that every byte it handed over was consumed. */
  return segsize;
}
- ??
- ??
- ? /*? ?
- ?*?Simple?curl?application?to?read?the?index.html?file?from?a?Web?site.? ?
- ?*/ ???
- ? int ?main(? void ?)???
- ?{???
- ??CURL?*curl;???
- ??CURLcode?ret;???
- ?? int ??wr_error;???
- ??
- ??wr_error?=? 0 ;???
- ??wr_index?=? 0 ;???
- ??
- ?? /*?First?step,?init?curl?*/ ???
- ??curl?=?curl_easy_init();???
- ?? if ?(!curl)?{???
- ????printf( "couldn't?init?curl\n" );???
- ???? return ? 0 ;???
- ??}???
- ??
- ?? /*?Tell?curl?the?URL?of?the?file?we're?going?to?retrieve?*/ ???
- ??curl_easy_setopt(?curl,?CURLOPT_URL,? "www.exampledomain.com" ?);???
- ??
- ?? /*?Tell?curl?that?we'll?receive?data?to?the?function?write_data,?and? ?
- ???*?also?provide?it?with?a?context?pointer?for?our?error?return.? ?
- ???*/ ???
- ??curl_easy_setopt(?curl,?CURLOPT_WRITEDATA,?( void ?*)&wr_error?);???
- ??curl_easy_setopt(?curl,?CURLOPT_WRITEFUNCTION,?write_data?);???
- ??
- ?? /*?Allow?curl?to?perform?the?action?*/ ???
- ??ret?=?curl_easy_perform(?curl?);???
- ??
- ??printf(? "ret?=?%d?(write_error?=?%d)\n" ,?ret,?wr_error?);???
- ??
- ?? /*?Emit?the?page?if?curl?indicates?that?no?errors?occurred?*/ ???
- ?? if ?(?ret?==? 0 ?)?printf(? "%s\n" ,?wr_buf?);???
- ??
- ??curl_easy_cleanup(?curl?);???
- ??
- ?? return ? 0 ;???
- ?}???
更多文章、技術交流、商務合作、聯繫博主
微信掃碼或搜索:z360901061

微信掃一掃加我為好友
QQ號聯系: 360901061
您的支持是博主寫作最大的動力,如果您喜歡我的文章,感覺我的文章對您有幫助,請用微信掃描下面二維碼支持博主2元、5元、10元、20元等您想捐的金額吧,狠狠點擊下面給點支持吧,站長非常感激您!手機微信長按不能支付解決辦法:請將微信支付二維碼保存到相冊,切換到微信,然後點擊微信右上角掃一掃功能,選擇支付二維碼完成支付。
【本文對您有幫助就好】元
