1
CSCE 590 Web Scraping - Selenium
Topics: Login to SelfServiceCarolina example
Readings:
February 21, 2017
2
Take-Home
4
Login to SSC using Selenium
5
if __name__ == "__main__":
    vipID = input("Enter your VIP ID:")
    password = getpass.getpass('Enter your Password:')
    driver = init_driver()
    login(driver, "Selenium")
    time.sleep(5)
    driver.quit()
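The deck never shows the imports these snippets rely on. A minimal sketch of what the later slides assume (the grouping is mine; the module paths are the standard Selenium/bs4 ones):

import csv
import time
import getpass

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException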
6
Open chromedriver

def init_driver():
    driver = webdriver.Chrome("E:/chromedriver_win32/chromedriver.exe")
    driver.wait = WebDriverWait(driver, 5)
    return driver
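Attaching a WebDriverWait to the driver is just a convenience: driver.wait.until(...) polls the page (every 0.5 s by default) until the condition yields an element or 5 seconds elapse, then raises TimeoutException. A sketch of the pattern the following slides use (the element ID here is hypothetical):

try:
    # poll up to 5 seconds for the element to appear in the DOM
    elem = driver.wait.until(EC.presence_of_element_located(
        (By.ID, "example-id")))  # hypothetical ID, for illustration only
    elem.click()
except TimeoutException:
    print("element never appeared")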
7
def login(driver, query):
    # URL was truncated in the slide capture; my.sc.edu is the SSC portal
    driver.get("https://my.sc.edu")
    print("MySC opened")
    try:
        link = driver.wait.until(EC.presence_of_element_located(
            (By.PARTIAL_LINK_TEXT, "Sign in to")))
        # g=twbkwbis.P_GenMenu%3Fname%3Dbmenu.P_MainMnu
        print("Found link", link)
        link.click()
        #button = driver.wait.until(EC.element_to_be_clickable(
        #    (By.NAME, "btnK")))
        #box.send_keys(query)
        #button.click()
    except TimeoutException:
        print("we have a problem First Page")
8
Login Page

try:
    user_box = driver.wait.until(EC.presence_of_element_located(
        (By.NAME, "username")))
    user_box.send_keys(vipID)
    passwd_box = driver.wait.until(EC.presence_of_element_located(
        (By.ID, "vipid-password")))
    passwd_box.send_keys(password)
    button = driver.wait.until(EC.element_to_be_clickable(
        (By.NAME, "submit")))
    print("Found submit button", button)
    #box.send_keys(query)
    button.click()
    print("Signed in successfully")
except TimeoutException:
    print("we have a problem Login Page")
9
Faculty Page

try:
    print("Signed in successfully -- Main Menu Page")
    facMainMenuBTN = driver.wait.until(EC.presence_of_element_located(
        (By.ID, "bmenu--P_FacMainMnu___UID3")))
    facMainMenuBTN.click()
except TimeoutException:
    print("we have a problem Main Page")

try:
    print("Made it to the Faculty Page")
    link = driver.wait.until(EC.presence_of_element_located(
        (By.ID, "bwskfcls--p_sel_crse_search___UID6")))
    # ?pkg=twbkwbis.P_GenMenu%3Fname%3Dbmenu.P_MainMnu
    link.click()
except TimeoutException:
    print("we have a problem Faculty Page")
10
try:
    button = driver.find_element_by_id("id____UID7")  # value="Submit"
    button.click()
    advSearchButton = driver.find_element_by_id("id____UID6")  # value="Advanced Search"
    advSearchButton.click()
    # the XPath selectors below were lost in the slide capture; they clicked
    # the <option> elements with value='COL' and value='MATH'
    select = driver.find_element_by_xpath("//option[@value='COL']").click()
    select = driver.find_element_by_xpath("//option[@value='MATH']").click()
    sectionSearchButton = driver.find_element_by_id("id____UID5")  # value="Advanced Search"
    page = sectionSearchButton.click()
except TimeoutException:
    print("we have a problem Faculty Page")
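For <select> dropdowns like the college/subject pickers above, Selenium also ships a Select helper; a sketch, assuming the subject field is named sel_subj (the real field name was lost in the capture):

from selenium.webdriver.support.ui import Select

# wrap the subject dropdown and pick MATH by its option value
subject = Select(driver.find_element_by_name("sel_subj"))  # field name is an assumption
subject.select_by_value("MATH")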
11
try:
    sections = driver.wait.until(EC.presence_of_element_located(
        (By.CLASS_NAME, "datadisplaytable")))
    print("Sections=", sections)
    html_page = driver.page_source
    soup = BeautifulSoup(html_page, 'html.parser')
except TimeoutException:
    print("we have a problem Section Search Results Page")
12
# Now that we have the page as "soup" let's generate a csv file from it
text = soup.get_text()
outfile = open('workfile.html', 'w')
prettypage = soup.prettify()
outfile.write(prettypage)
outfile.close()
13
# Python 3's csv module wants a text-mode file, not 'wb'
with open('listing.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    for tr in soup.find_all('tr')[2:]:  # skip the two header rows
        tds = tr.find_all('td')
        row = [elem.text for elem in tds]
        writer.writerow(row)
return
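A quick sanity check of the export (run separately), assuming the text-mode open above:

import csv

# print every row of the CSV we just wrote
with open('listing.csv', newline='', encoding='utf-8') as f:
    for row in csv.reader(f):
        print(row)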
14
Cleaning your data – Chapter 7
15
1-2grams.py

from urllib.request import urlopen
from bs4 import BeautifulSoup

def getNgrams(input, n):
    input = input.split(' ')
    output = []
    for i in range(len(input)-n+1):
        output.append(input[i:i+n])
    return output
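A worked example of what this returns: each n-gram is a list of n consecutive tokens, so a 4-word string yields three 2-grams.

>>> getNgrams("the quick brown fox", 2)
[['the', 'quick'], ['quick', 'brown'], ['brown', 'fox']]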
16
html = urlopen("http://en. wikipedia
html = urlopen(" bsObj = BeautifulSoup(html, "html.parser") content = bsObj.find("div", {"id":"mw-content-text"}).get_text() ngrams = getNgrams(content, 2) print(ngrams) print("2-grams count is: "+str(len(ngrams)))
17
2-clean2grams.py

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import OrderedDict

def cleanInput(input):
    pass  # filled in on the next slide

def getNgrams(input, n):
    pass  # filled in two slides down
18
def cleanInput(input):
    input = re.sub('\n+', " ", input)
    input = re.sub('\[[0-9]*\]', "", input)
    input = re.sub(' +', " ", input)
    input = bytes(input, "UTF-8")
    input = input.decode("ascii", "ignore")
    cleanInput = []
    input = input.split(' ')
    for item in input:
        item = item.strip(string.punctuation)
        if len(item) > 1 or (item.lower() == 'a' or item.lower() == 'i'):
            cleanInput.append(item)
    return cleanInput
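What the cleanup buys you, on a small made-up string: newlines become spaces, the citation marker [1] disappears, punctuation is stripped, and single-character tokens other than a/i are dropped.

>>> cleanInput("The Python[1] language,\nversion 3 is a joy")
['The', 'Python', 'language', 'version', 'is', 'a', 'joy']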
19
def getNgrams(input, n):
    input = cleanInput(input)
    output = dict()
    for i in range(len(input)-n+1):
        newNGram = " ".join(input[i:i+n])
        if newNGram in output:
            output[newNGram] += 1
        else:
            output[newNGram] = 1
    return output
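The if/else tally is exactly what collections.Counter does; a behavior-preserving alternative sketch (Counter is a dict subclass, so the sorting code on the next slide still works):

from collections import Counter

def getNgrams(input, n):
    input = cleanInput(input)
    # Counter counts each n-gram string for us
    return Counter(" ".join(input[i:i+n]) for i in range(len(input)-n+1))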
20
html = urlopen("http://en. wikipedia
html = urlopen(" bsObj = BeautifulSoup(html, "html.parser") content = bsObj.find("div", {"id":"mw-content-text"}).get_text() #ngrams = getNgrams(content, 2) #print(ngrams) #print("2-grams count is: "+str(len(ngrams))) ngrams = getNgrams(content, 2) ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True)) print(ngrams)