forked from shrutijain27/Crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCrawler_initial.py
More file actions
56 lines (48 loc) · 1.83 KB
/
Crawler_initial.py
File metadata and controls
56 lines (48 loc) · 1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
__author__ = 'shruti'
# Crawler to crawl flipkart
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from tutorial.items import CrawlingItem
import json
import string
class FlipkartSpider(BaseSpider):
    """Spider that crawls Flipkart laptop listing pages.

    For every configured listing URL it requests each page in turn and,
    in `parse`, extracts one `CrawlingItem` (model, offer price, image
    URL, product URL) per product unit found on the page.
    """
    name = "flipkart_spider"
    # NOTE(review): Scrapy's attribute is `allowed_domains`; as written this
    # misspelling means off-site filtering is effectively disabled — confirm.
    allow_domains = ["flipkart.com"]

    def start_requests(self):
        """Yield one request per page for every configured listing URL.

        Each entry in `lines` holds a listing URL template and the number
        of pages ("count") to crawl for it.  Iterating the entries
        directly replaces the previous x/y/p counter juggling, which
        raised IndexError by indexing `lines[y]` past the end of the
        single-element list once the last page had been requested.
        """
        lines = [{
            "url": "http://www.flipkart.com/laptops/pr?sid=6bo,b5g&otracker=ch_vn_laptop_filter_Laptop%20Brands_All%20Brands",
            "count": "65"}]
        for entry in lines:
            pages = int(entry['count'])
            for page in range(1, pages + 1):
                # str.format is a no-op when the URL has no "{}" placeholder;
                # kept so templates that do include one paginate correctly.
                yield self.make_requests_from_url(entry['url'].format(str(page)))

    def parse(self, response):
        """Extract one CrawlingItem per product unit on the listing page.

        Returns a list of items; each field is the raw `.extract()` result
        (a list of strings, possibly empty when the node is absent).
        """
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//div[contains(@class,'product-unit unit-4 browse-product new-design')]")
        items = []
        for title in titles:
            item = CrawlingItem()
            item['model'] = title.select(".//div[contains(@class,'pu-title')]/a/text()").extract()
            item['offer'] = title.select(".//div[contains(@class,'pu-final')]/span/text()").extract()
            item['image'] = title.select(".//div[contains(@class,'pu-visual-section')]/a/img/@src").extract()
            item['standard_url'] = title.select(".//div[contains(@class,'pu-title')]/a/@href").extract()
            items.append(item)
        return items