space scraper scripts
parent
34a2b1132c
commit
e044608eaf
|
@ -0,0 +1,189 @@
|
||||||
|
import json
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
from selenium.webdriver.support.ui import Select
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.common.exceptions import NoSuchElementException
|
||||||
|
from selenium.common.exceptions import TimeoutException
|
||||||
|
import time
|
||||||
|
|
||||||
|
json_data = {}
|
||||||
|
old = new = ""
|
||||||
|
|
||||||
|
def visitCategories(filename):
|
||||||
|
visitBlackList = ['00', '01']
|
||||||
|
|
||||||
|
driver.find_element_by_xpath("//input[@id='LeftBar_contentplaceholder_control_chooseBldg_RadioButtonList_listby_1']").click()
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
options = Select(driver.find_element_by_xpath("//select[@id='LeftBar_contentplaceholder_control_chooseBldg_dropdownlist_bldglist']")).options
|
||||||
|
for i in range(0, len(options)):
|
||||||
|
options[i] = options[i].text
|
||||||
|
|
||||||
|
for i in range(1, len(options)):
|
||||||
|
currentCategory = options[i]
|
||||||
|
if currentCategory not in visitBlackList:
|
||||||
|
select = Select(driver.find_element_by_xpath("//select[@id='LeftBar_contentplaceholder_control_chooseBldg_dropdownlist_bldglist']"))
|
||||||
|
select.select_by_index(i)
|
||||||
|
categoryXPath = "//table[@id='LeftBar_contentplaceholder_control_chooseBldg_gridview_bldglist']/tbody/tr[2]/td[2]"
|
||||||
|
WebDriverWait(driver, 5).until(EC.text_to_be_present_in_element((By.XPATH, categoryXPath), options[i]))
|
||||||
|
visitBuildings()
|
||||||
|
|
||||||
|
with open(filename, 'w') as outfile:
|
||||||
|
json.dump(json_data, outfile)
|
||||||
|
|
||||||
|
|
||||||
|
def visitBuildings():
|
||||||
|
bldgLinks = driver.find_elements_by_xpath("//td/a[@class='nav_bldgList']")
|
||||||
|
for i in range(0, len(bldgLinks) - 2, 2):
|
||||||
|
bldgLinks = driver.find_elements_by_xpath("//td/a[@class='nav_bldgList']")
|
||||||
|
newBldgNumber = bldgLinks[i + 1].text
|
||||||
|
bldgLinks[i].click()
|
||||||
|
bldgXPath = "//span[@id='Body_panel_restricted_body_label_bldgId']"
|
||||||
|
WebDriverWait(driver, 5).until(EC.text_to_be_present_in_element((By.XPATH, bldgXPath), newBldgNumber))
|
||||||
|
getPDFs()
|
||||||
|
#json_data_new = getInfo()
|
||||||
|
#json_data_new.update(getDetails())
|
||||||
|
#bldgId = json_data_new['id']
|
||||||
|
#json_data.update({bldgId : json_data_new})
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def getInfo():
|
||||||
|
json_data_new = {}
|
||||||
|
json_data_new.update({'name' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_bldgname']").text})
|
||||||
|
json_data_new.update({'id' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_bldgId']").text})
|
||||||
|
json_data_new.update({'gsf' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_gsf']").text})
|
||||||
|
json_data_new.update({'ocf' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_constyr']").text})
|
||||||
|
json_data_new.update({'abbrev' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_abbrev']").text})
|
||||||
|
json_data_new.update({'address' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_address']").text.title()})
|
||||||
|
json_data_new.update({'city' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_city']").text.title()})
|
||||||
|
json_data_new.update({'state' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_state']").text})
|
||||||
|
json_data_new.update({'zipcode' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_zip']").text})
|
||||||
|
json_data_new.update({'status' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_status']").text})
|
||||||
|
json_data_new.update({'classif' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_class']").text})
|
||||||
|
json_data_new.update({'proptype' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_proptype']").text})
|
||||||
|
json_data_new.update({'comments' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_comments']").text.capitalize()})
|
||||||
|
try:
|
||||||
|
json_data_new.update({'address2' : driver.find_element_by_xpath("//span[@id='Body_panel_restricted_body_label_address2").text.title()})
|
||||||
|
except NoSuchElementException:
|
||||||
|
json_data_new.update({'address2' : ""})
|
||||||
|
|
||||||
|
return json_data_new
|
||||||
|
|
||||||
|
def getPDFs():
|
||||||
|
try:
|
||||||
|
pdfs = driver.find_elements_by_xpath("//div[@id='Body_panel_restricted_body_panel_floorplanbuttons']//a")
|
||||||
|
except NoSuchElementException:
|
||||||
|
pdfs = []
|
||||||
|
# TODO FOLDERS BASED ON BLDGID
|
||||||
|
for pdf in pdfs:
|
||||||
|
pdf.click()
|
||||||
|
|
||||||
|
def getDetails():
|
||||||
|
json_data_new = {}
|
||||||
|
try:
|
||||||
|
details = driver.find_elements_by_xpath("//table[@id='Body_panel_restricted_body_radiobuttonlist_infotype']/tbody//input")
|
||||||
|
detailsText = driver.find_elements_by_xpath("//table[@id='Body_panel_restricted_body_radiobuttonlist_infotype']/tbody//label")
|
||||||
|
except NoSuchElementException:
|
||||||
|
details = detailsText = []
|
||||||
|
for i in range(0, len(detailsText)):
|
||||||
|
detailsText[i] = detailsText[i].text
|
||||||
|
|
||||||
|
detailFuncList = [getRoomDetail, getBldgDetail, getDeptDetail, getUsageDetail, getSummaryDetail]
|
||||||
|
for i in range(0, len(details)):
|
||||||
|
details = driver.find_elements_by_xpath("//table[@id='Body_panel_restricted_body_radiobuttonlist_infotype']/tbody//input")
|
||||||
|
details[i].click()
|
||||||
|
detailsData = detailFuncList[i]()
|
||||||
|
json_data_new.update({detailsText[i] : detailsData})
|
||||||
|
|
||||||
|
return json_data_new
|
||||||
|
|
||||||
|
def getRoomDetail():
|
||||||
|
return getTableDetail('Body_panel_restricted_body_dropdownlist_floor', ['Body_panel_restricted_body_gridview_roomInfoTotals', 'Body_panel_restricted_body_gridview_roomInfo'], 'Room Info')
|
||||||
|
|
||||||
|
def getBldgDetail():
|
||||||
|
return getTableDetail('', ['Body_panel_restricted_body_gridview_bldgTotals'], 'Building Info')
|
||||||
|
|
||||||
|
def getDeptDetail():
|
||||||
|
return getTableDetail('Body_panel_restricted_body_dropdownlist_floor', ['Body_panel_restricted_body_gridview_deptTotals', 'Body_panel_restricted_body_gridview_deptInfo'], 'Department Info')
|
||||||
|
|
||||||
|
def getUsageDetail():
|
||||||
|
return getTableDetail('Body_panel_restricted_body_dropdownlist_floor', ['Body_panel_restricted_body_gridview_usageTotals', 'Body_panel_restricted_body_gridview_usageRooms'], 'Usage Info')
|
||||||
|
|
||||||
|
def getSummaryDetail():
|
||||||
|
return getTableDetail('Body_panel_restricted_body_dropdownlist_floor', ['Body_panel_restricted_body_gridview_stateguidelines', 'Body_panel_restricted_body_gridview_stateguidelinesAreas'], 'Summary')
|
||||||
|
|
||||||
|
def getTableDetail(selectID, tableIDs, initialCaption):
|
||||||
|
selectXPath = "//select[@id='{selectID}']" . format(selectID = selectID)
|
||||||
|
tableXPaths = []
|
||||||
|
for i in range(0, len(tableIDs)):
|
||||||
|
new_xpath = "//table[@id='{tableID}']" . format(tableID = tableIDs[i])
|
||||||
|
tableXPaths.append(new_xpath)
|
||||||
|
|
||||||
|
captionXPath = "{tableXPath}/caption" . format(tableXPath = tableXPaths[0])
|
||||||
|
try:
|
||||||
|
WebDriverWait(driver, 5).until(EC.text_to_be_present_in_element((By.XPATH, captionXPath), initialCaption))
|
||||||
|
except TimeoutException:
|
||||||
|
e = 1
|
||||||
|
|
||||||
|
json_data_new = {}
|
||||||
|
|
||||||
|
if selectID == '':
|
||||||
|
for tableXPath in tableXPaths:
|
||||||
|
json_data_new.update(getTable(tableXPath))
|
||||||
|
|
||||||
|
try:
|
||||||
|
options = Select(driver.find_element_by_xpath(selectXPath)).options
|
||||||
|
except NoSuchElementException:
|
||||||
|
options = []
|
||||||
|
|
||||||
|
for i in range(0, len(options)):
|
||||||
|
options[i] = options[i].text
|
||||||
|
|
||||||
|
for i in range(0, len(options)):
|
||||||
|
select = Select(driver.find_element_by_xpath(selectXPath))
|
||||||
|
select.select_by_index(i)
|
||||||
|
captionXPath = "{tableXPath}/caption" . format(tableXPath = tableXPaths[0])
|
||||||
|
try:
|
||||||
|
WebDriverWait(driver, 5).until(EC.text_to_be_present_in_element((By.XPATH, captionXPath), options[i]))
|
||||||
|
except TimeoutException:
|
||||||
|
e = 1
|
||||||
|
json_data_new.update({options[i] : {}})
|
||||||
|
for tableXPath in tableXPaths:
|
||||||
|
try:
|
||||||
|
json_data_new[options[i]].update(getTable(tableXPath))
|
||||||
|
except NoSuchElementException:
|
||||||
|
e = 1
|
||||||
|
|
||||||
|
return json_data_new
|
||||||
|
|
||||||
|
def getTable(tableID, caption, label):
|
||||||
|
json_data_new = {}
|
||||||
|
#caption = driver.find_element_by_xpath("{tableID}/caption" . format(tableID = tableID))
|
||||||
|
#headers = driver.find_elements_by_xpath("{tableID}/tbody/tr/th" . format(tableID = tableID))
|
||||||
|
headers = ["Specific Id", "Type", "Description"]
|
||||||
|
data = driver.find_elements_by_xpath("{tableID}/tbody/tr/td" . format(tableID = tableID))
|
||||||
|
|
||||||
|
#json_data_new = {caption.text : []}
|
||||||
|
json_data_new = {caption : []}
|
||||||
|
headerLen = len(headers)
|
||||||
|
for i in range(0, len(data), headerLen):
|
||||||
|
new_json = {}
|
||||||
|
for j in range(0, headerLen):
|
||||||
|
#key = headers[j].text
|
||||||
|
key = headers[j]
|
||||||
|
value = data[i + j].text
|
||||||
|
new_json[key] = value
|
||||||
|
#json_data_new[caption.text].append(new_json)
|
||||||
|
json_data_new[caption].append(new_json)
|
||||||
|
|
||||||
|
return json_data_new
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
driver = webdriver.Chrome()
|
||||||
|
driver.get("https://space.facilities.vt.edu/Lock/bldgAndRoom.aspx")
|
||||||
|
input("Press Enter After you've logged in...")
|
||||||
|
#visitCategories(driver)
|
Loading…
Reference in New Issue