当前位置:主页 > PHP开发 > PHP基础 >

php爬取天猫和淘宝商品数据

栏目:PHP基础 来源: 微信小程序开发网 阅读: 2018-03-26

最近做的一个网站需要根据商品网址爬取天猫和淘宝的商品信息。首先看了下手机端的网页,发现用的是 React,不太了解,没法搞,所以就考虑从 PC 端入口爬取数据。但是直接爬取商品 URL 时并没有获取到价格、库存等信息,仔细研究后发现这些数据是页面异步请求另一个接口得到的,而且该接口必须带上 Referer 请求头才能返回数据。于是就通过以下方式写了一个简单的爬虫,用于爬取商品预览图和商品的第一个分类的价格、库存等。

二、实现

代码如下:

/**
 * Crawl a Taobao or Tmall PC item page and collect basic product data.
 *
 * The item page itself does not contain price/stock information; that data is
 * loaded asynchronously from a JSONP endpoint which only answers when the
 * request carries the item page as its Referer header.
 *
 * @param string $url Item page URL. URLs containing "taobao.com" are handled
 *                    as Taobao items, everything else as Tmall items.
 *
 * @return array|false False when the page reports the item as delisted,
 *                     otherwise an array with the keys:
 *                     - title     (string) page title without the site suffix
 *                     - price     original price, or null when unavailable
 *                     - act_price promotion data, or null when unavailable
 *                     - stock     stock quantity, or null when unavailable
 *                     - banners   (array) preview image URLs, possibly empty
 */
function crawlUrl($url)
{
    import('PhpQuery.Curl');
    $curl = new \Curl();
    $page = $curl->read($url);
    $raw = isset($page['content']) ? $page['content'] : '';
    $content = mb_convert_encoding($raw, 'UTF-8', 'UTF-8,GBK,GB2312,BIG5');

    // Delisted items have nothing worth crawling (same check for both sites).
    if (strpos($content, '此宝贝已下架') !== false) {
        return false;
    }

    $title = preg_match("|<title>(.*)</title>|isU", $content, $match) ? $match[1] : '';

    $myres = array();
    // Strict comparison: strpos() may legitimately return 0, which == false.
    if (strpos($url, 'taobao.com') !== false) {
        $item_id = preg_match("|itemId : '(.*)'|isU", $content, $match) ? $match[1] : '';
        $seller_id = preg_match("|sellerId : '(.*)'|isU", $content, $match) ? $match[1] : '';

        // Price and stock information.
        $detail = null;
        if ($item_id !== '' && $seller_id !== '') {
            $api_url = 'https://detailskip.taobao.com/service/getData/1/p1/item/detail/sib.htm?itemId=' . urlencode($item_id) . '&sellerId=' . urlencode($seller_id) . '&modules=dynStock,qrcode,viewer,price,duty,xmpPromotion,delivery,upp,activity,fqg,zjys,amountRestriction,couponActivity,soldQuantity,originalPrice,tradeContract&callback=onSibRequestSuccess';
            $detail = crawlUrlFetchJsonp($api_url, $url);
        }
        $data = (isset($detail['data']) && is_array($detail['data'])) ? $detail['data'] : array();

        $myres['title'] = str_replace('-淘宝网', '', $title);
        $myres['price'] = (isset($data['originalPrice']) && is_array($data['originalPrice']))
            ? current($data['originalPrice'])
            : null;
        $myres['act_price'] = (isset($data['promotion']['promoData']) && is_array($data['promotion']['promoData']))
            ? current($data['promotion']['promoData'])
            : null;
        $myres['stock'] = isset($data['dynStock']['stock']) ? $data['dynStock']['stock'] : null;
        // Preview images (Taobao lazy-loads them via data-src).
        $myres['banners'] = crawlUrlExtractBanners($content, '/<img data-src="(.*?)" \//');
    } else {
        // Read the item id from the query string instead of relying on a
        // fixed 12-character length or on a trailing "&spm" parameter.
        $item_id = '';
        $query = parse_url($url, PHP_URL_QUERY);
        if (is_string($query)) {
            parse_str($query, $params);
            if (isset($params['id']) && is_string($params['id']) && preg_match('/\A\d+\z/', $params['id'])) {
                $item_id = $params['id'];
            }
        }

        // Price and stock information.
        $detail = null;
        if ($item_id !== '') {
            $api_url = 'https://mdskip.taobao.com/core/initItemDetail.htm?cachedTimestamp=1500562177777&queryMemberRight=true&cartEnable=true&offlineShop=false&addressLevel=2&itemId=' . $item_id . '&tryBeforeBuy=false&isAreaSell=false&tmallBuySupport=true&isPurchaseMallPage=false&household=false&isForbidBuyItem=false&service3C=false&isRegionLevel=false&showShopProm=false&isSecKill=false&sellerPreview=false&isUseInventoryCenter=false&isApparel=true&callback=setMdskip&timestamp=1500562172109&isg=AiUlDZFWmP/sMgVurQSILU3Ytet/Zdis&isg2=Ajk51JIhRFqKzxmiNPP6dkYxSKXT7iySkzSTeVtu9WDf4ll0o5Y9yKdyEtHu';
            $detail = crawlUrlFetchJsonp($api_url, $url);
        }
        $model = (isset($detail['defaultModel']) && is_array($detail['defaultModel']))
            ? $detail['defaultModel']
            : array();

        // Only the first SKU is reported.
        $first_sku = '';
        $first_sku_stock = null;
        if (isset($model['inventoryDO']['skuQuantity']) && is_array($model['inventoryDO']['skuQuantity'])) {
            foreach ($model['inventoryDO']['skuQuantity'] as $sku => $quantity) {
                $first_sku = $sku;
                $first_sku_stock = $quantity;
                break;
            }
        }
        $price_info = isset($model['itemPriceResultDO']['priceInfo'][$first_sku])
            ? $model['itemPriceResultDO']['priceInfo'][$first_sku]
            : null;

        $myres['title'] = str_replace('-tmall.com天猫', '', $title);
        $myres['price'] = isset($price_info['price']) ? $price_info['price'] : null;
        // Prefer the promotion list; fall back to the SKU's whole price entry.
        $myres['act_price'] = isset($price_info['suggestivePromotionList'])
            ? $price_info['suggestivePromotionList']
            : $price_info;
        if (!empty($model['inventoryDO']['totalQuantity'])) {
            $myres['stock'] = $model['inventoryDO']['totalQuantity'];
        } else {
            $myres['stock'] = isset($first_sku_stock['quantity']) ? $first_sku_stock['quantity'] : null;
        }
        // Preview images.
        $myres['banners'] = crawlUrlExtractBanners($content, '/<img src="(.*?)" \//');
    }

    return $myres;
}

/**
 * Fetch a JSONP endpoint with the given Referer and decode its payload.
 *
 * @param string $api_url Full endpoint URL, including the callback parameter.
 * @param string $referer Value sent as the Referer header.
 *
 * @return array|null Decoded payload, or null when the request fails or the
 *                    response is not valid JSON/JSONP.
 */
function crawlUrlFetchJsonp($api_url, $referer)
{
    $ch = curl_init();
    if ($ch === false) {
        return null;
    }
    curl_setopt_array($ch, array(
        CURLOPT_URL            => $api_url,
        CURLOPT_HEADER         => false,
        CURLOPT_CONNECTTIMEOUT => 15,
        CURLOPT_TIMEOUT        => 300,
        CURLOPT_AUTOREFERER    => true,
        CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.47 Safari/536.11',
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_REFERER        => $referer,
        // NOTE(security): certificate verification is disabled, so the
        // response can be tampered with in transit. Enable it once a CA
        // bundle is configured on the host.
        CURLOPT_SSL_VERIFYPEER => false,
    ));
    $body = curl_exec($ch);
    curl_close($ch);
    if (!is_string($body) || $body === '') {
        return null;
    }

    // json_decode() requires UTF-8; this is a no-op for valid UTF-8 input.
    $body = mb_convert_encoding($body, 'UTF-8', 'UTF-8,GBK,GB2312,BIG5');

    // Strip only the outer "callback( ... )" wrapper. Removing every
    // parenthesis would corrupt string values inside the payload.
    if (preg_match('/\A\s*[\w$.]+\s*\((.*)\)[^)]*\z/s', $body, $wrapped)) {
        $body = $wrapped[1];
    }

    $decoded = json_decode($body, true);

    return is_array($decoded) ? $decoded : null;
}

/**
 * Extract image URLs from the first attribute-less <ul> element of a page.
 *
 * @param string $content     Page HTML.
 * @param string $img_pattern Regex whose first capture group is the image URL.
 *
 * @return array Captured image URLs; empty when no list or image is found.
 */
function crawlUrlExtractBanners($content, $img_pattern)
{
    if (!preg_match('|<ul>(.*)</ul>|isU', $content, $list)) {
        return array();
    }
    preg_match_all($img_pattern, $list[1], $images);

    return isset($images[1]) ? $images[1] : array();
}

相关文章