1. Python / Говнокод #23172

    0

    1. 01
    2. 02
    3. 03
    4. 04
    5. 05
    6. 06
    7. 07
    8. 08
    9. 09
    10. 10
    11. 11
    12. 12
    13. 13
    14. 14
    15. 15
    16. 16
    17. 17
    18. 18
    19. 19
    20. 20
    21. 21
    22. 22
    23. 23
    24. 24
    25. 25
    26. 26
    27. 27
    28. 28
    29. 29
    30. 30
    31. 31
    32. 32
    33. 33
    34. 34
    35. 35
    36. 36
    37. 37
    38. 38
    39. 39
    40. 40
    41. 41
    42. 42
    43. 43
    44. 44
    45. 45
    46. 46
    47. 47
    48. 48
    49. 49
    50. 50
    51. 51
    52. 52
    53. 53
    54. 54
    55. 55
    56. 56
    57. 57
    58. 58
    59. 59
    60. 60
    import csv
    import json
    
    import requests
    
    from io import StringIO
    
    from lxml import etree
    
    
    class DayPicParser(object):
        """Crawl the paginated post listing of a site and dump all posts as JSON.

        Listing pages are fetched from ``<url>/page/<n>`` for ``n`` from 1 to
        ``last_page``.  For every post found on a listing page the full-text
        page is downloaded as well, and its paragraphs and images are collected.
        """

        # Base address of the site; listing pages live under '<url>/page/<n>'.
        url = 'урл_до_сайта'
        # Number of the last listing page to crawl (numbering starts at 1).
        last_page = 1077
        # Seconds to wait for the server before a request is abandoned.
        timeout = 30
        # Output file.  NOTE: despite the extension the content is one JSON array.
        output_path = 'daypic.csv'

        def get_info(self):
            """Crawl every listing page and write the collected posts to disk.

            Each post becomes a dict with the keys ``title``, ``preview`` (source
            of the first image, or ``''``), ``short_text`` and ``full_text`` (a
            list of ``{'image': src}`` / ``{'text': paragraph}`` items in
            document order).  Posts without a headline link are skipped.

            The posts gathered so far are written even when the crawl is
            interrupted by an exception; the exception still propagates.
            """
            rows = []
            # Open first so an unwritable path fails before any network traffic.
            with open(self.output_path, 'w') as handler:
                try:
                    for page in range(1, self.last_page + 1):
                        listing = self._fetch_tree(
                            '{}/page/{}'.format(self.url, page))
                        posts = listing.xpath(
                            '//div[@class="posts"]/div[@class="post"]')
                        # Count from 1 so the last message reads "Post N of N".
                        for number, post in enumerate(posts, 1):
                            row = self._parse_post(post)
                            if row is not None:
                                rows.append(row)
                            print('Post {} of {}'.format(number, len(posts)))
                        # Report the page that has just been completed.
                        print(page)
                finally:
                    # Keep partial results instead of leaving an empty file.
                    handler.write(json.dumps(rows))

        def _fetch_tree(self, address):
            """Download *address* and return its body parsed as an lxml HTML tree."""
            # requests.get() uses a throwaway session, so nothing is left to close.
            response = requests.get(address, timeout=self.timeout)
            return etree.parse(StringIO(response.text), etree.HTMLParser())

        def _parse_post(self, post):
            """Build the output row for one listing entry, or None if it has no link."""
            title = post.find('./h2/a')
            link_to_fulltext = title.get('href') if title is not None else None
            if not link_to_fulltext:
                # Without a headline link there is no full-text page to follow.
                return None

            short_text = post.find('./div[@class="text"]//p[1]')
            full_tree = self._fetch_tree(link_to_fulltext)

            main_image = None
            full_text = []
            for inner in full_tree.xpath('//div[@class="text"]//p'):
                # Paragraphs containing a <noindex> element are left out entirely.
                if inner.find('.//noindex') is not None:
                    continue
                image = inner.find('.//img')
                src = image.get('src') if image is not None else None
                if src:
                    # The first image encountered doubles as the post preview.
                    if not main_image:
                        main_image = src
                    full_text.append({'image': src})
                text = etree.tostring(
                    inner, method="text", encoding='utf-8').decode('utf-8')
                for paragraph in text.strip().split("\n"):
                    if paragraph:
                        full_text.append({'text': paragraph})

            return {'title': self._stripped_text(title),
                    'preview': main_image if main_image else '',
                    'short_text': self._stripped_text(short_text),
                    'full_text': full_text}

        @staticmethod
        def _stripped_text(element):
            """Return ``element.text`` stripped, or '' if the element or text is missing."""
            if element is None or element.text is None:
                return ''
            return element.text.strip()
    
    
    if __name__ == '__main__':
        # Start the crawl only when executed as a script, so that importing
        # this module does not trigger network requests or touch the disk.
        DayPicParser().get_info()

    Запостил: storvus, 10 Июля 2017

    Комментарии (4) RSS

    Добавить комментарий