Presentation is loading. Please wait.

Presentation is loading. Please wait.

CSCE 590 Web Scraping - Selenium

Similar presentations


Presentation on theme: "CSCE 590 Web Scraping - Selenium"— Presentation transcript:

1 CSCE 590 Web Scraping - Selenium
Topics Login to SelfServiceCarolina example Readings: February 21, 2017

2 Take-Home

3

4 Login to SSC using Selenium

5 if __name__ == "__main__": vipID=input("Enter your VIP ID:") password= getpass.getpass('Enter your Password:') driver = init_driver() login(driver, "Selenium") time.sleep(5) driver.quit()

6 Open chromedriver def init_driver(): driver = webdriver.Chrome("E:/chromedriver_win32/chromedriver.exe") driver.wait = WebDriverWait(driver, 5) return driver

7 def login(driver, query): driver.get("https://my.sc
def login(driver, query): driver.get("https://my.sc.edu") print ("MySC opened") try: link = driver.wait.until(EC.presence_of_element_located( (By.PARTIAL_LINK_TEXT, "Sign in to"))) # g=twbkwbis.P_GenMenu%3Fname%3Dbmenu.P_MainMnu print ("Found link", link) link.click() #button = driver.wait.until(EC.element_to_be_clickable( # (By.NAME, "btnK"))) #box.send_keys(query) #button.click() except TimeoutException: print("we have a problem First Page")

8 Login Page try: user_box = driver.wait.until(EC.presence_of_element_located( (By.NAME, "username"))) user_box.send_keys(vipID) passwd_box = driver.wait.until(EC.presence_of_element_located( (By.ID, "vipid-password"))) passwd_box.send_keys(password) button = driver.wait.until(EC.element_to_be_clickable( (By.NAME, "submit"))) print ("Found submit button", button) #box.send_keys(query) button.click() print ("Signed in successfully") except TimeoutException: print("we have a problem Login Page")

9 Faculty Page try: print ("Signed in successfully-- Main Menu Page") facMainMenuBTN = driver.wait.until(EC.presence_of_element_located( (By.ID, "bmenu--P_FacMainMnu___UID3"))) facMainMenuBTN.click() except TimeoutException: print("we have a problem Main Page") print ("Made it to the Faculty Page") link = driver.wait.until(EC.presence_of_element_located( (By.ID, "bwskfcls--p_sel_crse_search___UID6"))) # ?pkg=twbkwbis.P_GenMenu%3Fname%3Dbmenu.P_MainMnu link.click() print("we have a problem Faculty Page")

10 try: select button = driver.find_element_by_id("id____UID7") # "value="Submit" button.click() advSearchButton= driver.find_element_by_id("id____UID6") # "value="Advanced Search" advSearchButton.click() select = driver.find_element_by_xpath("//option[@value='COL']").click() select = driver.find_element_by_xpath("//option[@value='MATH']").click() sectionSearchButton= driver.find_element_by_id("id____UID5") # "value="Advanced Search" page = sectionSearchButton.click() except TimeoutException: print("we have a problem Faculty Page")

11 try: sections = driver.wait.until(EC
try: sections = driver.wait.until(EC.presence_of_element_located( (By.CLASS_NAME, "datadisplaytable"))) print ("Sections=", sections) html_page = driver.page_source soup = BeautifulSoup(html_page, 'html.parser') except TimeoutException: print("we have a problem Section Search Results Page")

12 # Now that we have the page as "soup" let's generate a csv file from it text = soup.get_text() outfile = open('workfile.html', 'w') prettypage = soup.prettify() outfile.write(prettypage) outfile.close()

13 with open('listing.csv', 'wb') as f: writer = csv
with open('listing.csv', 'wb') as f: writer = csv.writer(f) for tr in soup.find_all('tr')[2:]: tds = tr.find_all('td') row = [elem.text.encode('utf-8') for elem in tds] writer.writerow(row) return

14 Cleaning your data – Chapter 7

15 1-2grams.py from urllib.request import urlopen from bs4 import BeautifulSoup def getNgrams(input, n): input = input.split(' ') output = [] for i in range(len(input)-n+1): output.append(input[i:i+n]) return output

16 html = urlopen("http://en.wikipedia
html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)") bsObj = BeautifulSoup(html, "html.parser") content = bsObj.find("div", {"id":"mw-content-text"}).get_text() ngrams = getNgrams(content, 2) print(ngrams) print("2-grams count is: "+str(len(ngrams)))

17 -2-clean2grams.py from urllib.request import urlopen from bs4 import BeautifulSoup import re import string from collections import OrderedDict def cleanInput(input): pass def getNgrams(input, n):

18 def cleanInput(input): input = re.sub('\n+', " ", input) input = re
def cleanInput(input): input = re.sub('\n+', " ", input) input = re.sub('\[[0-9]*\]', "", input) input = re.sub(' +', " ", input) input = bytes(input, "UTF-8") input = input.decode("ascii", "ignore") cleanInput = [] input = input.split(' ') for item in input: item = item.strip(string.punctuation) if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'): cleanInput.append(item) return cleanInput

19 def getNgrams(input, n): input = cleanInput(input) output = dict() for i in range(len(input)-n+1): newNGram = " ".join(input[i:i+n]) if newNGram in output: output[newNGram] += 1 else: output[newNGram] = 1 return output

20 html = urlopen("http://en.wikipedia
html = urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)") bsObj = BeautifulSoup(html, "html.parser") content = bsObj.find("div", {"id":"mw-content-text"}).get_text() #ngrams = getNgrams(content, 2) #print(ngrams) #print("2-grams count is: "+str(len(ngrams))) ngrams = getNgrams(content, 2) ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True)) print(ngrams)

21

22

23

24


Download ppt "CSCE 590 Web Scraping - Selenium"

Similar presentations


Ads by Google