Pulling twitter followers data using selenium chrome webdriver in python? Not able to load all the followers

Asked by: Ashish Verma · Asked: 4/2/2017 · Updated: 11/25/2017 · Views: 2311

Q:

I am trying to pull twitter followers data for an account with 80K followers, using the Selenium chrome webdriver and BeautifulSoup. I am facing two problems with my script:

1) While scrolling to the bottom of the page to load all the followers before grabbing the full page source, my script does not scroll all the way down. It stops scrolling after a random number of followers has loaded and then starts iterating through each follower profile to fetch their data. I want it to load all the followers on the page first and only then start iterating through the profiles (see the sketch after these two points).

2) My second problem is that every time the script runs, it tries to scroll to the bottom over and over until all the followers are loaded, and only then starts pulling the data by parsing one follower at a time. In my case that would take 4 to 5 days to fetch all the followers' data (80K followers). Is there a better way to do this?
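To make the first problem concrete, the kind of loop I am after is roughly the sketch below. This is only a sketch: it assumes the 2017 followers page, where each follower is rendered as a div.ProfileCard element, and that driver is an already logged-in webdriver.

import time

def load_all_followers(driver, pause=5, max_stalls=3):
    # Scroll until the number of loaded follower cards stops growing.
    stalls = 0
    last_count = 0
    while stalls < max_stalls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # give the next batch of cards time to load
        count = len(driver.find_elements_by_css_selector("div.ProfileCard"))
        if count == last_count:
            stalls += 1    # no growth - could be done, could just be slow
        else:
            stalls = 0
            last_count = count
    return last_count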

Here is my script:

from bs4 import BeautifulSoup
import sys
import os,re
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from os import listdir
from os.path import isfile, join

print "Running for chrome."

chromedriver=sys.argv[1]
download_path=sys.argv[2]
os.system('killall -9 "Google Chrome"')
try:
	os.environ["webdriver.chrome.driver"]=chromedriver
	chromeOptions = webdriver.ChromeOptions()
	prefs = {"download.default_directory" : download_path}
	chromeOptions.add_experimental_option("prefs",prefs)
	driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chromeOptions)
	driver.implicitly_wait(20)
	driver.maximize_window()
except Exception as err:
	print "Error:Failed to open chrome."
	print "Error: ",err
	driver.stop_client()
	driver.close()
	
#opening the web page
try:
	driver.get('https://twitter.com/login')
except Exception as err:
	print "Error:Failed to open url."
	print "Error: ",err
	driver.stop_client()
	driver.close()

username = driver.find_element_by_xpath("//input[@name='session[username_or_email]' and @class='js-username-field email-input js-initial-focus']")
password = driver.find_element_by_xpath("//input[@name='session[password]' and @class='js-password-field']")

username.send_keys("###########")
password.send_keys("###########")
driver.find_element_by_xpath("//button[@type='submit']").click()
#os.system('killall -9 "Google Chrome"')
driver.get('https://twitter.com/sadserver/followers')



followers_link=driver.page_source  # follower page, 18 cards at a time
soup=BeautifulSoup(followers_link,'html.parser')

output=open('twitter_follower_sadoperator.csv','a')
output.write('Name,Twitter_Handle,Location,Bio,Join_Date,Link'+'\n')
div = soup.find('div',{'class':'GridTimeline-items has-items'})
bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
name_list=[]
lastHeight = driver.execute_script("return document.body.scrollHeight")

followers_per_page = 18     # Twitter loads ~18 follower cards per scroll
followers_count = 80000     # approximate follower count of the target account

for _ in xrange(0, followers_count/followers_per_page + 1):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)
        newHeight = driver.execute_script("return document.body.scrollHeight")
        if newHeight == lastHeight:
                followers_link=driver.page_source  # follower page, 18 cards at a time
                soup=BeautifulSoup(followers_link,'html.parser')
                div = soup.find('div',{'class':'GridTimeline-items has-items'})
                bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
                for name in bref:
                        name_list.append(name['href'])
                break
        lastHeight = newHeight
        followers_link=''

print len(name_list)


for x in range(0,len(name_list)):
        #print name['href']
        #print name.text
        driver.stop_client()
        driver.get('https://twitter.com'+name_list[x])
        page_source=driver.page_source
        each_soup=BeautifulSoup(page_source,'html.parser')
        profile=each_soup.find('div',{'class':'ProfileHeaderCard'})
                            
        try:
                name = profile.find('h1',{'class':'ProfileHeaderCard-name'}).find('a').text
                if name:
                        output.write('"'+name.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in name:',e

        try:
                handle=profile.find('h2',{'class':'ProfileHeaderCard-screenname u-inlineBlock u-dir'}).text
                if handle:
                        output.write('"'+handle.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in handle:',e

        try:
                location = profile.find('div',{'class':'ProfileHeaderCard-location'}).text
                if location:
                        output.write('"'+location.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in location:',e

        try:
                bio=profile.find('p',{'class':'ProfileHeaderCard-bio u-dir'}).text
                if bio:
                        output.write('"'+bio.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in bio:',e
                        
        try:
                joinDate = profile.find('div',{'class':'ProfileHeaderCard-joinDate'}).text
                if joinDate:
                        output.write('"'+joinDate.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in joindate:',e
        
        try:
                url =  [check.find('a') for check in profile.find('div',{'class':'ProfileHeaderCard-url'}).findAll('span')][1]
                if url:
                        output.write('"'+url['href'].strip().encode('utf-8')+'"'+'\n')
                else:
                        output.write(' '+'\n')
        except Exception as e:
                output.write(' '+'\n')
                print 'Error in url:',e
        


        
output.close()


os.system("kill -9 `ps -deaf | grep chrome | awk '{print $2}'`")

python-2.7 python-3.x selenium beautifulsoup chrome-web-driver

Comments


Answers:

0 votes · innicoder · 4/2/2017 · #1

A better way: use the Twitter API. Here is a quick GitHub script I found (sorry, you have probably already sunk a lot of time into Selenium, and there are some benefits to not using the API). A good read on the automation and on how it all works: the Twitter API docs.
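A rough sketch of the API route, assuming a tweepy 3.x install and your own app credentials in place of the placeholder strings:

import tweepy

# Placeholder credentials - replace with your own app's keys and tokens.
auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# followers/list returns up to 200 profiles per call; Cursor handles the paging.
for follower in tweepy.Cursor(api.followers, screen_name="sadoperator", count=200).items():
    print("%s,%s,%s" % (follower.screen_name, follower.location, follower.created_at))

Rate limits still apply, but the paging and retries are handled for you instead of a multi-day browser session.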

There is a way to scroll many times, but you have to do some math or set a condition to stop it.

driver.execute_script("window.scrollTo(0, 10000);") 

Say you have 10k followers, 100 are shown on the initial load and 10 more are loaded on every scroll after that; you would need to scroll about 990 more times.

This is, of course, exactly what alecxe :D does for your case in his Quora* answer. By alecxe:

html = driver.page_source

Once you have revealed all the followers (by scrolling), you can grab .page_source and then parse it with something like BeautifulSoup.
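Putting the two together, a minimal scroll-then-parse sketch (it reuses the class names from your own script, so treat them as assumptions about the 2017 markup, and driver is the already logged-in webdriver):

import time
from bs4 import BeautifulSoup

scrolls = 990  # e.g. (10000 - 100) / 10 extra batches, as estimated above
for _ in range(scrolls):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # let the next batch render before scrolling again

soup = BeautifulSoup(driver.page_source, 'html.parser')
cards = soup.find_all('a', {'class': 'ProfileCard-bg js-nav'})
profile_paths = [card['href'] for card in cards]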

Comments

0 votes · Ashish Verma · 4/3/2017
Instead of hitting Twitter's website, could we load all the followers by scrolling manually, save the page source to a text file, and then loop through all the followers' data from that text file? I don't know whether that would work. If it would, could you provide the code for that part? I tried to do it but had no luck. Thanks.
0 votes · innicoder · 4/4/2017
Yes, there is a selenium attribute for that: .page_source. Example: html = driver.page_source
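A small sketch of the save-then-parse idea from the comment above, assuming the followers page has already been scrolled to the bottom by hand in the same driver session:

import io
from bs4 import BeautifulSoup

# Dump the fully-scrolled page once...
with io.open('followers_page.html', 'w', encoding='utf-8') as f:
    f.write(driver.page_source)

# ...then later, parse the saved file without touching the browser again.
with io.open('followers_page.html', 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

links = soup.find_all('a', {'class': 'ProfileCard-bg js-nav'})
profile_paths = [a['href'] for a in links]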
0 votes · Ashish Verma · 4/3/2017 · #2

I did not follow the implementation alecxe mentioned in his answer exactly, but my script still does not parse all the followers. It still loads a random number of them, and I can't seem to get to the bottom of this. Could someone try running it in their terminal and see whether they are able to load all the followers? Here is the modified script:

from bs4 import BeautifulSoup
import sys
import os,re
import time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from os import listdir
from os.path import isfile, join

print "Running for chrome."

chromedriver=sys.argv[1]
download_path=sys.argv[2]
os.system('killall -9 "Google Chrome"')
try:
	os.environ["webdriver.chrome.driver"]=chromedriver
	chromeOptions = webdriver.ChromeOptions()
	prefs = {"download.default_directory" : download_path}
	chromeOptions.add_experimental_option("prefs",prefs)
	driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=chromeOptions)
	driver.implicitly_wait(20)
	driver.maximize_window()
except Exception as err:
	print "Error:Failed to open chrome."
	print "Error: ",err
	driver.stop_client()
	driver.close()
	
#opening the web page
try:
	driver.get('https://twitter.com/login')
except Exception as err:
	print "Error:Failed to open url."
	print "Error: ",err
	driver.stop_client()
	driver.close()

username = driver.find_element_by_xpath("//input[@name='session[username_or_email]' and @class='js-username-field email-input js-initial-focus']")
password = driver.find_element_by_xpath("//input[@name='session[password]' and @class='js-password-field']")

username.send_keys("*****************")
password.send_keys("*****************")
driver.find_element_by_xpath("//button[@type='submit']").click()
#os.system('killall -9 "Google Chrome"')
driver.get('https://twitter.com/sadoperator/followers')



followers_link=driver.page_source  # follower page, 18 cards at a time
soup=BeautifulSoup(followers_link,'html.parser')

output=open('twitter_follower_sadoperator.csv','a')
output.write('Name,Twitter_Handle,Location,Bio,Join_Date,Link'+'\n')
div = soup.find('div',{'class':'GridTimeline-items has-items'})
bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
name_list=[]
lastHeight = driver.execute_script("return document.body.scrollHeight")

followers_link=driver.page_source  # follower page, 18 cards at a time
soup=BeautifulSoup(followers_link,'html.parser')

followers_per_page = 18
followers_count = 15777


for _ in xrange(0, followers_count/followers_per_page + 1):
        driver.execute_script("window.scrollTo(0, 7755000);")
        time.sleep(2)
        newHeight = driver.execute_script("return document.body.scrollHeight")
        if newHeight == lastHeight:
                followers_link=driver.page_source  # follower page, 18 cards at a time
                soup=BeautifulSoup(followers_link,'html.parser')
                div = soup.find('div',{'class':'GridTimeline-items has-items'})
                bref = div.findAll('a',{'class':'ProfileCard-bg js-nav'})
                for name in bref:
                        name_list.append(name['href'])
                break
        lastHeight = newHeight
        followers_link=''

print len(name_list)

'''
for x in range(0,len(name_list)):
        #print name['href']
        #print name.text
        driver.stop_client()
        driver.get('https://twitter.com'+name_list[x])
        page_source=driver.page_source
        each_soup=BeautifulSoup(page_source,'html.parser')
        profile=each_soup.find('div',{'class':'ProfileHeaderCard'})
                            
        try:
                name = profile.find('h1',{'class':'ProfileHeaderCard-name'}).find('a').text
                if name:
                        output.write('"'+name.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in name:',e

        try:
                handle=profile.find('h2',{'class':'ProfileHeaderCard-screenname u-inlineBlock u-dir'}).text
                if handle:
                        output.write('"'+handle.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in handle:',e

        try:
                location = profile.find('div',{'class':'ProfileHeaderCard-location'}).text
                if location:
                        output.write('"'+location.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in location:',e

        try:
                bio=profile.find('p',{'class':'ProfileHeaderCard-bio u-dir'}).text
                if bio:
                        output.write('"'+bio.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in bio:',e
                        
        try:
                joinDate = profile.find('div',{'class':'ProfileHeaderCard-joinDate'}).text
                if joinDate:
                        output.write('"'+joinDate.strip().encode('utf-8')+'"'+',')
                else:
                        output.write(' '+',')
        except Exception as e:
                output.write(' '+',')
                print 'Error in joindate:',e
        
        try:
                url =  [check.find('a') for check in profile.find('div',{'class':'ProfileHeaderCard-url'}).findAll('span')][1]
                if url:
                        output.write('"'+url['href'].strip().encode('utf-8')+'"'+'\n')
                else:
                        output.write(' '+'\n')
        except Exception as e:
                output.write(' '+'\n')
                print 'Error in url:',e
        


        
output.close()
'''

os.system("kill -9 `ps -deaf | grep chrome | awk '{print $2}'`")
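A variant of the scroll loop that tolerates a few rounds with no height change before giving up might behave more predictably: the loop above breaks the first time newHeight == lastHeight, which can also happen when the next batch is simply slow to load. A sketch only:

lastHeight = driver.execute_script("return document.body.scrollHeight")
stalls = 0
while stalls < 5:  # allow several "no growth" rounds before concluding we are done
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        stalls += 1
    else:
        stalls = 0
        lastHeight = newHeight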

0 votes · Michael Perelman · 11/25/2017 · #3
  1. In Firefox or another browser, open the developer console and write down (copy) the request that fires while you scroll down the page - you will use it to build your own requests. The request will look something like this - https://twitter.com/DiaryofaMadeMan/followers/users?include_available_features=1&include_entities=1&max_position=1584951385597824282&reset_error_state=false - and in the HTML source, search for data-min-position, which looks like this - data-min-position="1584938620170076301"
  2. Load the HTML with PhantomJS and parse it with BeautifulSoup. You need to get the first batch of followers and the "data-min" value. Save the followers to a list and "data-min-position" to a variable.
  3. Use the request saved in step 1 and the saved "data-min" to build a new request - just replace the request's max_position number with the saved data-min.
  4. Send the request with python requests (no more webdriver) and receive the JSON response.
  5. Get the new followers and the new data-min from the response JSON.
  6. Repeat 2, 3, 4 until data-min = 0.

This way is much better than the API because you can load large amounts of data without any limits; a rough sketch of the loop follows.
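A sketch of steps 3-6 with python requests, heavily hedged: the URL shape is the one captured above, the cookies must come from a logged-in browser session, and I am assuming the 2017-era endpoint returned JSON with items_html and min_position keys.

import requests
from bs4 import BeautifulSoup

BASE = ("https://twitter.com/DiaryofaMadeMan/followers/users"
        "?include_available_features=1&include_entities=1"
        "&max_position={pos}&reset_error_state=false")

session = requests.Session()
session.headers.update({"X-Requested-With": "XMLHttpRequest"})

def fetch_followers(start_position, cookies):
    # start_position is the data-min-position scraped in step 2;
    # cookies are copied from the logged-in browser session.
    position = start_position
    handles = []
    while position and str(position) != "0":
        resp = session.get(BASE.format(pos=position), cookies=cookies)
        data = resp.json()  # assumed keys: items_html, min_position
        chunk = BeautifulSoup(data.get("items_html", ""), "html.parser")
        for a in chunk.find_all("a", {"class": "ProfileCard-bg js-nav"}):
            handles.append(a["href"])
        position = data.get("min_position")
    return handles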