티스토리 뷰

Python/Note

selenium 안드로이드 앱 크롤링

j0n9m1n1 j0n9m1n1 2019. 2. 13. 16:50
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
'''
pip install selenium
pip install requests
chromedriver 필요

용량제한X
카테고리 별 분류 필요
app 들의 urls 수집 후 
for each list 그리고 그 url에 있는 카테고리 
폴더가 없으면 생성 후 다운로드
있으면 거기에 다운로드
이론상 25페이지 x 20개 = 500개 다운로드
'''
import os, sys, time
import requests
from selenium import webdriver
 
# --- Configuration -----------------------------------------------------------
page = 25          # number of listing pages to walk
in_page = 20       # number of app entries read from each listing page
sleep_time = 30    # seconds to wait after clicking a download link
chromePath = 'C:/Users/hacke/Desktop/for_reaLife/chromedriver'
download_base = 'C:/Users/hacke/Desktop/for_reaLife/crawling_app/'

# Absolute XPaths of the download link, tried in this order on each app's
# download page; the first one that can be found and clicked wins.
_DOWNLOAD_XPATHS = (
    '/html/body/div[5]/div[1]/div[1]/a',
    '/html/body/div[5]/div[4]/a',
    '/html/body/div[5]/div[2]/a',
)


def _click_download(browser):
    """Click the first download link that exists on the current page.

    Tries each XPath in ``_DOWNLOAD_XPATHS`` in order. After a successful
    click it sleeps ``sleep_time`` seconds and returns True. Returns False
    when none of the XPaths could be located/clicked.
    """
    for xpath in _DOWNLOAD_XPATHS:
        try:
            browser.find_element_by_xpath(xpath).click()
        except Exception:
            # This layout variant is not present; fall through to the next
            # candidate. (Exception, not a bare except, so Ctrl+C still works.)
            continue
        time.sleep(sleep_time)
        return True
    return False


chromeOptions = webdriver.ChromeOptions()
driver = webdriver.Chrome(chromePath)

# list_links[p][k] holds the download-page URL of entry k on listing page p.
list_links = [[""] * in_page for _ in range(page)]

try:
    # --- Phase 1: collect the download-page URL of every listed app ---------
    for i in range(1, page + 1):
        driver.get('https://apk.support/apps-updated/?f_page=' + str(i))
        # The first listing page keeps the app list under div[5]; every other
        # page keeps it under div[6].
        container = 5 if i == 1 else 6
        for j in range(1, in_page + 1):
            attr = driver.find_element_by_xpath(
                '/html/body/div[' + str(container) + ']/div[2]/ul/li['
                + str(j) + ']/dl/a')
            href = attr.get_attribute('href')
            # Rewrite only the first "app" in the URL to "download-app".
            list_links[i - 1][j - 1] = href.replace("app", "download-app", 1)

    # --- Phase 2: visit each URL and download into a per-category folder ----
    for i in range(page):
        for j in range(in_page):
            link = list_links[i][j]
            driver.get(link)
            category = str(driver.find_element_by_xpath(
                '/html/body/div[4]/div[1]/a[3]').text)

            # Create the exact folder Chrome is told to download into
            # (no-op when it already exists).
            target_dir = download_base + category
            os.makedirs(target_dir, exist_ok=True)

            # The download directory is a start-up preference, so the browser
            # is restarted with new prefs for every app.
            driver.quit()
            prefs = {"download.default_directory": target_dir}
            chromeOptions.add_experimental_option("prefs", prefs)
            driver = webdriver.Chrome(chromePath, 0, chrome_options=chromeOptions)
            driver.get(link)
            driver.implicitly_wait(5)
            print(i, j)

            if not _click_download(driver):
                print('no download link found:', link)

    # TODO: optionally rename each downloaded file after it finishes
    # (os.rename(current_file, download_path + new_name + extension)).

    print(list_links)
    time.sleep(10)
finally:
    # Always close the last browser instance, even if the crawl aborted.
    driver.quit()
 
 
 
 
cs


예외처리도 부족하고 급하게 만들었던 것이라 참고만 해주세요