space scraper scripts

master
Paul Walko 2017-09-01 10:44:53 -04:00
parent 34a2b1132c
commit e044608eaf
1 changed files with 189 additions and 0 deletions

189
space_scraper.py Executable file
View File

@ -0,0 +1,189 @@
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
import time
json_data = {}
old = new = ""
def visitCategories(filename):
visitBlackList = ['00', '01']
driver.find_element_by_xpath("//input[@id='LeftBar_contentplaceholder_control_chooseBldg_RadioButtonList_listby_1']").click()
time.sleep(2)
options = Select(driver.find_element_by_xpath("//select[@id='LeftBar_contentplaceholder_control_chooseBldg_dropdownlist_bldglist']")).options
for i in range(0, len(options)):
options[i] = options[i].text
for i in range(1, len(options)):
currentCategory = options[i]
if currentCategory not in visitBlackList:
select = Select(driver.find_element_by_xpath("//select[@id='LeftBar_contentplaceholder_control_chooseBldg_dropdownlist_bldglist']"))
select.select_by_index(i)
categoryXPath = "//table[@id='LeftBar_contentplaceholder_control_chooseBldg_gridview_bldglist']/tbody/tr[2]/td[2]"
WebDriverWait(driver, 5).until(EC.text_to_be_present_in_element((By.XPATH, categoryXPath), options[i]))
visitBuildings()
with open(filename, 'w') as outfile:
json.dump(json_data, outfile)
def visitBuildings():
bldgLinks = driver.find_elements_by_xpath("//td/a[@class='nav_bldgList']")
for i in range(0, len(bldgLinks) - 2, 2):
bldgLinks = driver.find_elements_by_xpath("//td/a[@class='nav_bldgList']")
newBldgNumber = bldgLinks[i + 1].text
bldgLinks[i].click()
bldgXPath = "//span[@id='Body_panel_restricted_body_label_bldgId']"
WebDriverWait(driver, 5).until(EC.text_to_be_present_in_element((By.XPATH, bldgXPath), newBldgNumber))
getPDFs()
#json_data_new = getInfo()
#json_data_new.update(getDetails())
#bldgId = json_data_new['id']
#json_data.update({bldgId : json_data_new})
def getInfo():
json_data_new = {}
json_data_new.update({'name' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_bldgname']").text})
json_data_new.update({'id' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_bldgId']").text})
json_data_new.update({'gsf' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_gsf']").text})
json_data_new.update({'ocf' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_constyr']").text})
json_data_new.update({'abbrev' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_abbrev']").text})
json_data_new.update({'address' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_address']").text.title()})
json_data_new.update({'city' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_city']").text.title()})
json_data_new.update({'state' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_state']").text})
json_data_new.update({'zipcode' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_zip']").text})
json_data_new.update({'status' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_status']").text})
json_data_new.update({'classif' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_class']").text})
json_data_new.update({'proptype' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_proptype']").text})
json_data_new.update({'comments' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_comments']").text.capitalize()})
try:
json_data_new.update({'address2' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_address2").text.title()})
except NoSuchElementException:
json_data_new.update({'address2' : ""})
return json_data_new
def getPDFs():
try:
pdfs = driver.find_elements_by_xpath("//div[@id='Body_panel_restricted_body_panel_floorplanbuttons']//a")
except NoSuchElementException:
pdfs = []
# TODO FOLDERS BASED ON BLDGID
for pdf in pdfs:
pdf.click()
def getDetails():
json_data_new = {}
try:
details = driver.find_elements_by_xpath("//table[@id='Body_panel_restricted_body_radiobuttonlist_infotype']/tbody//input")
detailsText = driver.find_elements_by_xpath("//table[@id='Body_panel_restricted_body_radiobuttonlist_infotype']/tbody//label")
except NoSuchElementException:
details = detailsText = []
for i in range(0, len(detailsText)):
detailsText[i] = detailsText[i].text
detailFuncList = [getRoomDetail, getBldgDetail, getDeptDetail, getUsageDetail, getSummaryDetail]
for i in range(0, len(details)):
details = driver.find_elements_by_xpath("//table[@id='Body_panel_restricted_body_radiobuttonlist_infotype']/tbody//input")
details[i].click()
detailsData = detailFuncList[i]()
json_data_new.update({detailsText[i] : detailsData})
return json_data_new
def getRoomDetail():
return getTableDetail('Body_panel_restricted_body_dropdownlist_floor', ['Body_panel_restricted_body_gridview_roomInfoTotals', 'Body_panel_restricted_body_gridview_roomInfo'], 'Room Info')
def getBldgDetail():
return getTableDetail('', ['Body_panel_restricted_body_gridview_bldgTotals'], 'Building Info')
def getDeptDetail():
return getTableDetail('Body_panel_restricted_body_dropdownlist_floor', ['Body_panel_restricted_body_gridview_deptTotals', 'Body_panel_restricted_body_gridview_deptInfo'], 'Department Info')
def getUsageDetail():
return getTableDetail('Body_panel_restricted_body_dropdownlist_floor', ['Body_panel_restricted_body_gridview_usageTotals', 'Body_panel_restricted_body_gridview_usageRooms'], 'Usage Info')
def getSummaryDetail():
return getTableDetail('Body_panel_restricted_body_dropdownlist_floor', ['Body_panel_restricted_body_gridview_stateguidelines', 'Body_panel_restricted_body_gridview_stateguidelinesAreas'], 'Summary')
def getTableDetail(selectID, tableIDs, initialCaption):
selectXPath = "//select[@id='{selectID}']" . format(selectID = selectID)
tableXPaths = []
for i in range(0, len(tableIDs)):
new_xpath = "//table[@id='{tableID}']" . format(tableID = tableIDs[i])
tableXPaths.append(new_xpath)
captionXPath = "{tableXPath}/caption" . format(tableXPath = tableXPaths[0])
try:
WebDriverWait(driver, 5).until(EC.text_to_be_present_in_element((By.XPATH, captionXPath), initialCaption))
except TimeoutException:
e = 1
json_data_new = {}
if selectID == '':
for tableXPath in tableXPaths:
json_data_new.update(getTable(tableXPath))
try:
options = Select(driver.find_element_by_xpath(selectXPath)).options
except NoSuchElementException:
options = []
for i in range(0, len(options)):
options[i] = options[i].text
for i in range(0, len(options)):
select = Select(driver.find_element_by_xpath(selectXPath))
select.select_by_index(i)
captionXPath = "{tableXPath}/caption" . format(tableXPath = tableXPaths[0])
try:
WebDriverWait(driver, 5).until(EC.text_to_be_present_in_element((By.XPATH, captionXPath), options[i]))
except TimeoutException:
e = 1
json_data_new.update({options[i] : {}})
for tableXPath in tableXPaths:
try:
json_data_new[options[i]].update(getTable(tableXPath))
except NoSuchElementException:
e = 1
return json_data_new
def getTable(tableID, caption, label):
json_data_new = {}
#caption = driver.find_element_by_xpath("{tableID}/caption" . format(tableID = tableID))
#headers = driver.find_elements_by_xpath("{tableID}/tbody/tr/th" . format(tableID = tableID))
headers = ["Specific Id", "Type", "Description"]
data = driver.find_elements_by_xpath("{tableID}/tbody/tr/td" . format(tableID = tableID))
#json_data_new = {caption.text : []}
json_data_new = {caption : []}
headerLen = len(headers)
for i in range(0, len(data), headerLen):
new_json = {}
for j in range(0, headerLen):
#key = headers[j].text
key = headers[j]
value = data[i + j].text
new_json[key] = value
#json_data_new[caption.text].append(new_json)
json_data_new[caption].append(new_json)
return json_data_new
driver = webdriver.Chrome()
driver.get("https://space.facilities.vt.edu/Lock/bldgAndRoom.aspx")
input("Press Enter After you've logged in...")
#visitCategories(driver)