"""Selenium scraper for the building pages at space.facilities.vt.edu.

Recovered from git patch e044608eafcef44fb94349fb0c74b2d4c104fe1b
("space scraper scripts", Paul Walko, Fri, 1 Sep 2017 10:44:53 -0400),
which created ``space_scraper.py``.

Running the module opens Chrome on the building/room page and waits for the
operator to log in by hand.  The scraping entry point, ``visitCategories``,
is not invoked automatically.
"""

import json
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
import time

logger = logging.getLogger(__name__)

# Accumulates scraped building records; written out by visitCategories().
json_data = {}
old = new = ""

# Seconds to wait for the page to reflect a click/selection.
_WAIT_SECONDS = 5

_CATEGORY_SELECT_XPATH = "//select[@id='LeftBar_contentplaceholder_control_chooseBldg_dropdownlist_bldglist']"
_BLDG_LINKS_XPATH = "//td/a[@class='nav_bldgList']"
_LABEL_XPATH = "//span[@id='Body_panel_restricted_body_label_{}']"

# (output key, label id suffix, optional text transform), in output order.
# NOTE(review): 'ocf' is filled from the 'constyr' label; kept as in the
# original, confirm the key name is intended.
_INFO_FIELDS = (
    ('name', 'bldgname', None),
    ('id', 'bldgId', None),
    ('gsf', 'gsf', None),
    ('ocf', 'constyr', None),
    ('abbrev', 'abbrev', None),
    ('address', 'address', str.title),
    ('city', 'city', str.title),
    ('state', 'state', None),
    ('zipcode', 'zip', None),
    ('status', 'status', None),
    ('classif', 'class', None),
    ('proptype', 'proptype', None),
    ('comments', 'comments', str.capitalize),
)

# Column names used by getTable() when the caller supplies the caption.
_DEFAULT_HEADERS = ["Specific Id", "Type", "Description"]


def _wait_for_text(xpath, text, required=True):
    """Wait until ``text`` appears in the element located by ``xpath``.

    Returns True when the text showed up.  On timeout, re-raises
    ``TimeoutException`` if ``required`` is true; otherwise logs a warning
    and returns False so best-effort callers can carry on.
    """
    try:
        WebDriverWait(driver, _WAIT_SECONDS).until(
            EC.text_to_be_present_in_element((By.XPATH, xpath), text))
    except TimeoutException:
        if required:
            raise
        logger.warning("Timed out waiting for %r in %s", text, xpath)
        return False
    return True


def _rows_from_cells(headers, cells):
    """Group a flat list of cell texts into one dict per table row.

    ``headers`` supplies the keys, so each row consumes ``len(headers)``
    cells.  An incomplete trailing row is dropped rather than raising
    ``IndexError``; an empty ``headers`` yields no rows.
    """
    width = len(headers)
    if width == 0:
        return []
    usable = len(cells) - len(cells) % width
    return [dict(zip(headers, cells[start:start + width]))
            for start in range(0, usable, width)]


def visitCategories(filename):
    """Visit every non-blacklisted category and dump ``json_data`` to a file.

    Switches the left-bar listing mode, then selects each entry of the
    building drop-down in turn (index 0 is skipped) and calls
    ``visitBuildings`` for it.  Afterwards ``json_data`` is written to
    ``filename`` as JSON.

    Raises ``TimeoutException`` if the listing does not show the selected
    category within the wait limit.
    """
    visitBlackList = ['00', '01']

    driver.find_element(
        By.XPATH,
        "//input[@id='LeftBar_contentplaceholder_control_chooseBldg_RadioButtonList_listby_1']",
    ).click()
    # Fixed pause after switching the list mode, as in the original.
    time.sleep(2)

    # Capture the option texts up front; the elements themselves are looked
    # up again on every iteration below.
    categories = [
        option.text
        for option in Select(driver.find_element(By.XPATH, _CATEGORY_SELECT_XPATH)).options
    ]

    categoryXPath = "//table[@id='LeftBar_contentplaceholder_control_chooseBldg_gridview_bldglist']/tbody/tr[2]/td[2]"
    for index, currentCategory in enumerate(categories):
        if index == 0 or currentCategory in visitBlackList:
            continue
        Select(driver.find_element(By.XPATH, _CATEGORY_SELECT_XPATH)).select_by_index(index)
        _wait_for_text(categoryXPath, currentCategory)
        visitBuildings()

    with open(filename, 'w') as outfile:
        json.dump(json_data, outfile)


def visitBuildings():
    """Open each building in the current listing and fetch its PDFs.

    The ``nav_bldgList`` links are consumed in pairs: the link at ``i`` is
    clicked and the text of the link at ``i + 1`` is the building number
    awaited on the detail panel.

    Raises ``TimeoutException`` if the detail panel does not show the
    expected building number within the wait limit.
    """
    link_count = len(driver.find_elements(By.XPATH, _BLDG_LINKS_XPATH))
    bldgXPath = _LABEL_XPATH.format('bldgId')
    # NOTE(review): the upper bound of len - 2 skips the final pair of
    # links; kept from the original, confirm this is deliberate.
    for i in range(0, link_count - 2, 2):
        # Look the links up again each pass instead of reusing old handles.
        bldgLinks = driver.find_elements(By.XPATH, _BLDG_LINKS_XPATH)
        newBldgNumber = bldgLinks[i + 1].text
        bldgLinks[i].click()
        _wait_for_text(bldgXPath, newBldgNumber)
        getPDFs()
        # JSON collection is currently disabled; only PDFs are fetched.
        #json_data_new = getInfo()
        #json_data_new.update(getDetails())
        #bldgId = json_data_new['id']
        #json_data.update({bldgId : json_data_new})


def getInfo():
    """Return the header fields of the currently displayed building as a dict.

    Address and city are title-cased and comments are capitalised.
    ``address2`` is optional and becomes ``""`` when its label is absent.

    Raises ``NoSuchElementException`` if any other label is missing.
    """
    info = {}
    for key, suffix, transform in _INFO_FIELDS:
        text = driver.find_element(By.XPATH, _LABEL_XPATH.format(suffix)).text
        info[key] = transform(text) if transform else text

    # The original XPath lacked its closing "']" and was therefore malformed.
    try:
        info['address2'] = driver.find_element(
            By.XPATH, _LABEL_XPATH.format('address2')).text.title()
    except NoSuchElementException:
        info['address2'] = ""

    return info


def getPDFs():
    """Click every link in the floor-plan button panel.

    ``find_elements`` returns an empty list when nothing matches, so a
    building without floor plans is simply a no-op.
    """
    pdfs = driver.find_elements(
        By.XPATH, "//div[@id='Body_panel_restricted_body_panel_floorplanbuttons']//a")
    # TODO FOLDERS BASED ON BLDGID
    for pdf in pdfs:
        pdf.click()


def getDetails():
    """Collect the data behind each info-type radio button.

    Returns a dict keyed by radio-button label text whose values come from
    the matching ``get*Detail`` function, paired by position.
    """
    radios_xpath = "//table[@id='Body_panel_restricted_body_radiobuttonlist_infotype']/tbody//input"
    labels_xpath = "//table[@id='Body_panel_restricted_body_radiobuttonlist_infotype']/tbody//label"

    radio_count = len(driver.find_elements(By.XPATH, radios_xpath))
    label_texts = [label.text for label in driver.find_elements(By.XPATH, labels_xpath)]

    detailFuncList = [getRoomDetail, getBldgDetail, getDeptDetail, getUsageDetail, getSummaryDetail]

    # Radio buttons, labels and handlers are matched by index; stop at the
    # shortest list instead of failing with IndexError.
    usable = min(radio_count, len(label_texts), len(detailFuncList))
    if usable < radio_count:
        logger.warning("Only %d of %d detail radio buttons can be handled",
                       usable, radio_count)

    details = {}
    for index in range(usable):
        # Look the inputs up again each pass instead of reusing old handles.
        driver.find_elements(By.XPATH, radios_xpath)[index].click()
        details[label_texts[index]] = detailFuncList[index]()

    return details


def getRoomDetail():
    """Scrape the room-info tables for every entry of the floor drop-down."""
    return getTableDetail(
        'Body_panel_restricted_body_dropdownlist_floor',
        ['Body_panel_restricted_body_gridview_roomInfoTotals',
         'Body_panel_restricted_body_gridview_roomInfo'],
        'Room Info')


def getBldgDetail():
    """Scrape the building-totals table (no drop-down involved)."""
    return getTableDetail(
        '',
        ['Body_panel_restricted_body_gridview_bldgTotals'],
        'Building Info')


def getDeptDetail():
    """Scrape the department tables for every entry of the floor drop-down."""
    return getTableDetail(
        'Body_panel_restricted_body_dropdownlist_floor',
        ['Body_panel_restricted_body_gridview_deptTotals',
         'Body_panel_restricted_body_gridview_deptInfo'],
        'Department Info')


def getUsageDetail():
    """Scrape the usage tables for every entry of the floor drop-down."""
    return getTableDetail(
        'Body_panel_restricted_body_dropdownlist_floor',
        ['Body_panel_restricted_body_gridview_usageTotals',
         'Body_panel_restricted_body_gridview_usageRooms'],
        'Usage Info')


def getSummaryDetail():
    """Scrape the summary tables for every entry of the floor drop-down."""
    return getTableDetail(
        'Body_panel_restricted_body_dropdownlist_floor',
        ['Body_panel_restricted_body_gridview_stateguidelines',
         'Body_panel_restricted_body_gridview_stateguidelinesAreas'],
        'Summary')


def _collect_tables(tableXPaths):
    """Merge getTable() results for each XPath, skipping tables that are absent."""
    merged = {}
    for tableXPath in tableXPaths:
        try:
            merged.update(getTable(tableXPath))
        except NoSuchElementException:
            logger.warning("Table not found: %s", tableXPath)
    return merged


def getTableDetail(selectID, tableIDs, initialCaption):
    """Scrape one or more tables, optionally once per drop-down option.

    Args:
        selectID: id of the ``<select>`` to iterate over, or ``''`` when the
            tables are not driven by a drop-down.
        tableIDs: ids of the tables to read; the caption of the first one is
            used to detect that the page has refreshed.
        initialCaption: caption text expected before any option is selected.

    Returns:
        With an empty ``selectID``: the merged ``getTable`` results.
        Otherwise: a dict mapping each option text to the merged results for
        that option.  Waits are best-effort; a timeout is logged and the
        tables are read anyway.
    """
    selectXPath = "//select[@id='{selectID}']".format(selectID=selectID)
    tableXPaths = ["//table[@id='{tableID}']".format(tableID=tableID)
                   for tableID in tableIDs]
    captionXPath = "{tableXPath}/caption".format(tableXPath=tableXPaths[0])

    _wait_for_text(captionXPath, initialCaption, required=False)

    if selectID == '':
        # No drop-down to iterate: read the tables once and finish.
        return _collect_tables(tableXPaths)

    try:
        option_texts = [
            option.text
            for option in Select(driver.find_element(By.XPATH, selectXPath)).options
        ]
    except NoSuchElementException:
        option_texts = []

    result = {}
    for index, option_text in enumerate(option_texts):
        # Look the <select> up again each pass instead of reusing the handle.
        Select(driver.find_element(By.XPATH, selectXPath)).select_by_index(index)
        _wait_for_text(captionXPath, option_text, required=False)
        result[option_text] = _collect_tables(tableXPaths)

    return result


def getTable(tableID, caption=None, label=None):
    """Read a table's cells into ``{caption: [row_dict, ...]}``.

    Args:
        tableID: XPath of the ``<table>`` element (despite the name).
        caption: key for the result.  When omitted, the text of the table's
            ``<caption>`` element is used and the column names are read from
            the table's ``<th>`` cells, falling back to the fixed defaults
            if there are none.  When given, the fixed default column names
            are used.
        label: unused; accepted for compatibility with existing callers.

    Raises:
        NoSuchElementException: ``caption`` was omitted and the table has no
            ``<caption>`` element.
    """
    headers = _DEFAULT_HEADERS
    if caption is None:
        caption = driver.find_element(
            By.XPATH, "{tableID}/caption".format(tableID=tableID)).text
        page_headers = [
            th.text for th in driver.find_elements(
                By.XPATH, "{tableID}/tbody/tr/th".format(tableID=tableID))
        ]
        if page_headers:
            headers = page_headers

    cells = [
        td.text for td in driver.find_elements(
            By.XPATH, "{tableID}/tbody/tr/td".format(tableID=tableID))
    ]

    return {caption: _rows_from_cells(headers, cells)}


driver = webdriver.Chrome()
driver.get("https://space.facilities.vt.edu/Lock/bldgAndRoom.aspx")
input("Press Enter After you've logged in...")
# NOTE(review): visitCategories() takes an output filename, not the driver.
#visitCategories(driver)