Web Scraping
I scraped the website bdodae.com for the data this project needs. Since I am optimizing node profits, I need both item values and information about the nodes themselves.
Tools of choice
- Python because I want to learn another language for data analysis (having used R throughout college)
- Selenium because bdodae.com renders its content with JavaScript rather than serving static HTML. The site is dynamic, so each page has to be loaded in a real browser before it can be scraped (see the short sketch after this list).
- BeautifulSoup to parse the webpage once it is loaded in Selenium.
- MySQLdb because I want to save the scraped information to a MySQL database that I can pull information from later.
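Before the full scripts below, here is a minimal sketch of how these pieces fit together: Selenium renders the JavaScript page, BeautifulSoup parses the rendered source, and MySQLdb stores whatever gets extracted. The URL and the item_popup selector come from the item script further down; the demo table and the connection credentials are placeholders for illustration only.
# Minimal load -> parse -> store sketch (placeholder table and credentials)
from selenium import webdriver
from bs4 import BeautifulSoup
import MySQLdb

driver = webdriver.Firefox()                       # Selenium renders the JavaScript page
driver.implicitly_wait(30)
driver.get("https://www.bdodae.com/nodes/index.php?page=items")
soup = BeautifulSoup(driver.page_source, 'lxml')   # BeautifulSoup parses the rendered source
names = [a.get_text() for a in soup.find_all('a', class_='item_popup')]

conn = MySQLdb.connect(host="localhost", user="yourusername",
                       passwd="yourpassword", db="bdodae")
cur = conn.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS demo (name CHAR(50))")   # placeholder table
cur.executemany("INSERT INTO demo (name) VALUES (%s)", [(n,) for n in names])
conn.commit()
driver.quit()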
Scraping item values
# Import packages
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import time
# Server Connection to MySQL:
import MySQLdb
conn = MySQLdb.connect(host="localhost",
                       user="yourusername",
                       passwd="yourpassword",
                       db="bdodae")
x = conn.cursor()
# Create table for prices
try:
    x.execute(
        """
        CREATE TABLE IF NOT EXISTS PRICES (
            ITEM_ID INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            ITEM CHAR(50) NOT NULL,
            USER_AVERAGE INT,
            RECENT_VALUE INT,
            VENDOR_SELL INT,
            VENDOR_BUY INT DEFAULT NULL)
        """)
    conn.commit()
except:
    conn.rollback()
# Scraping item values
url = "https://www.bdodae.com/nodes/index.php?page=items"
# Create a new Firefox session and go to the URL above
driver = webdriver.Firefox()
driver.implicitly_wait(30)
driver.get(url)
# Not mandatory
#driver.maximize_window()
# Initial page scrape
soup_values = BeautifulSoup(driver.page_source, 'lxml')
# Getting item names
items_soup = soup_values.find_all('a', class_ = "item_popup")
# Initialize empty list we are going to append item names to
items = []
for item in items_soup:
    item_name = item.get_text()
    items.append(item_name)
# Define a function to clean the text/values
def clean_values(values):
    for i in range(len(values)):
        values[i] = re.search("(\d+)", values[i].get_text()).group(0)
    if (len(values) == 3):
        # Add NULL to vendor_buy if it does not have one
        values.insert(2, None)
# Find the value buttons to click on
value_buttons = driver.find_elements_by_class_name('value_button')
wait = WebDriverWait(driver, 10)
# Big loop that:
# 1.) Clicks on the values boxes to open up the hidden table
# 2.) Scrapes the page, including the now revealed hidden table
# 3.) Finds the values in the hidden table and cleans it
# 4.) Writes the item name and corresponding values into a MySQL database
# 5.) Waits until the hidden table is actually visible before we click off of it to close it
# 6.) Repeats steps 1-5 for all items/rows on the page
for item, button, i in zip(items, value_buttons, range(len(items))):
    # Click on value box to open hidden table
    button.click()
    # If it clicks too fast, it'll throw an error since the table will cover other values
    time.sleep(1)
    # Now we scrape the hidden table
    soup_values = BeautifulSoup(driver.page_source, 'lxml')
    values = soup_values.find_all('div', class_ = "value_extra value_extra_on")[-1].find_all('div', class_ = "value_option")
    # Clean the values output
    clean_values(values)
    values_list = [item] + values
    # Safety check for when our code stops or does not find the values
    if (len(values_list) != 5):
        print("ERROR missing values")
        print(values_list)
        break
    # Input data into database
    try:
        x.execute(
            """
            INSERT INTO prices (item, user_average, recent_value, vendor_buy, vendor_sell)
            VALUES (%s, %s, %s, %s, %s)
            """, values_list)
        conn.commit()
    except:
        conn.rollback()
    # Wait until hidden table pops up
    # If one of the values in the table is a duplicate, then the CSS selector has a 3 instead of a 2 at the end
    # Weird, but it's just how it is
    if (len(set(values_list)) == 3 or len(set(values_list)) == 4):
        element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "tr.search:nth-child(" + str(i+2) + ") > td:nth-child(2) > div:nth-child(1) > div:nth-child(3)")))
    else:
        element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "tr.search:nth-child(" + str(i+2) + ") > td:nth-child(2) > div:nth-child(1) > div:nth-child(2)")))
    # Click off the values box to close the hidden table
    off_button = driver.find_element_by_css_selector("tr.search:nth-child(" + str(i+2) + ") > td:nth-child(3)")
    off_button.click()
    # As of 10/24/2018 reset_actions() does NOT clear locally in Firefox
    # So there will be lots of extra random clicking if you run this in Firefox
    #action.move_to_element_with_offset(button, -5, 0).click().perform()
    #action.reset_actions()
    time.sleep(1)
pd.options.display.max_rows = 6
# Table 'prices' that we obtained from web scraping bdodae.com
prices = pd.read_sql('SELECT * FROM prices', con = conn)
prices
| | ITEM_ID | ITEM | USER_AVERAGE | RECENT_VALUE | VENDOR_SELL | VENDOR_BUY |
|---|---|---|---|---|---|---|
| 0 | 1 | Acacia Sap | 387 | 662 | 70 | NaN |
| 1 | 2 | Acacia Timber | 874 | 907 | 105 | NaN |
| 2 | 3 | Aloe | 212 | 174 | 40 | NaN |
| ... | ... | ... | ... | ... | ... | ... |
| 198 | 199 | White Umbrella Mushroom | 273 | 531 | 46 | NaN |
| 199 | 200 | Withered Leaf | 110 | 110 | 110 | NaN |
| 200 | 201 | Zinc Ore | 1016 | 1058 | 90 | NaN |
201 rows × 6 columns
Scraping node data
# Import packages
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
from itertools import chain
import re
import pandas as pd
import os
import time
# Launch URL
url = "https://www.bdodae.com/"
# Create a new Firefox session
driver = webdriver.Firefox()
driver.implicitly_wait(30)
driver.get(url)
python_button = driver.find_element_by_id("nodes_nav")
python_button.click() # Click node link
# Storing page source in a variable
soup_level1 = BeautifulSoup(driver.page_source, 'lxml')
# Grab all page links to nodes
links = soup_level1.find_all('a', {'href' : re.compile("^index\.php\?node"), 'class' : False})
# Server Connection to MySQL:
import MySQLdb
conn = MySQLdb.connect(host="localhost",
                       user="yourusername",
                       passwd="yourpassword",
                       db="bdodae")
x = conn.cursor()
# Create table for nodes
try:
    x.execute(
        """
        CREATE TABLE IF NOT EXISTS NODE (
            NODE_ID INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            NAME CHAR(50) NOT NULL,
            CP INT DEFAULT 0,
            AREA CHAR(50),
            TYPE CHAR(50),
            REGION_MOD_PERCENT FLOAT DEFAULT NULL,
            CONNECTIONS CHAR(255))
        """)
    conn.commit()
except:
    conn.rollback()
# Create table for subnodes
try:
    x.execute(
        """
        CREATE TABLE IF NOT EXISTS SUBNODE (
            SUBNODE_ID INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            NAME CHAR(50) NOT NULL,
            CP INT DEFAULT 0,
            BASE_WORKLOAD INT DEFAULT NULL,
            CURRENT_WORKLOAD INT DEFAULT NULL,
            ITEM1 CHAR(50) DEFAULT NULL,
            ITEM2 CHAR(50) DEFAULT NULL,
            ITEM3 CHAR(50) DEFAULT NULL,
            ITEM4 CHAR(50) DEFAULT NULL,
            YIELD1 FLOAT DEFAULT NULL,
            YIELD2 FLOAT DEFAULT NULL,
            YIELD3 FLOAT DEFAULT NULL,
            YIELD4 FLOAT DEFAULT NULL,
            DISTANCE1 INT DEFAULT 0,
            DISTANCE2 INT DEFAULT NULL,
            DISTANCE3 INT DEFAULT NULL,
            CITY1 CHAR(50),
            CITY2 CHAR(50) DEFAULT NULL,
            CITY3 CHAR(50) DEFAULT NULL)
        """)
    conn.commit()
except:
    conn.rollback()
# Go through every node page
for link in links:
    # Get node link
    link = link.get_text()
    # Click on link
    python_button = driver.find_element_by_link_text(link)
    python_button.click()
    # soup_level2 contains the page source for the node clicked on
    soup_level2 = BeautifulSoup(driver.page_source, 'lxml')
    # General node info
    node_title = soup_level2.find(class_ = 'n_title_link').get_text()
    node_name = re.split('\s(?=\dCP)', node_title)[0]
    node_cp = re.split('\s(?=\dCP)', node_title)[1][0]
    node_area = soup_level2.find(class_ = 'n_area').get_text()
    node_type = soup_level2.find(class_ = 'n_type').get_text()
    node_connections = re.split('Connected: ', soup_level2.find(class_ = 'n_connected').get_text())[1]
    # Check if region modifier exists
    if (soup_level2.find(class_ = 'n_region')):
        node_region_mod = re.search('\d', soup_level2.find(class_ = 'n_region').get_text()).group(0)
    else:
        node_region_mod = None
    # Combine info into one variable
    node_list = [node_name,
                 node_cp,
                 node_area,
                 node_type,
                 node_region_mod,
                 node_connections]
    # Insert into MySQL table 'node'
    try:
        x.execute(
            """
            INSERT INTO node (name, cp, area, type, region_mod_percent, connections)
            VALUES (%s, %s, %s, %s, %s, %s)
            """, node_list)
        conn.commit()
    except:
        conn.rollback()
    # Get subnode information if it exists
    # Otherwise this part does nothing
    node_subnode_subtype = soup_level2.find_all(class_ = 's_subtype')
    node_subnode_workload = soup_level2.find_all(class_ = 's_workload')
    node_subnode_tables = soup_level2.find_all('table', class_ = re.compile("^(?!worker_select_)"))
    # Check if there are subnode tables
    if node_subnode_tables:
        node_subnode_tables = pd.read_html(str(node_subnode_tables[1:]), header = 0)
        node_subnode_name = []
        node_subnode_cp = []
        base_workload = []
        current_workload = []
        for text, i in zip(node_subnode_subtype, range(len(node_subnode_subtype))):
            node_subnode_name = [node_name + " - " + re.split('\s(?=\dCP)', text.get_text())[0] + "." + str(i+1)]
            node_subnode_cp = [re.split('\s(?=\dCP)', text.get_text())[1][0]]
            # Workload information should come in pairs: base and current workload
            if (len(node_subnode_workload) % 2) == 0:
                base_workload = [re.search('\d+', node_subnode_workload[(2*i)-1].get_text()).group(0)]
                current_workload = [re.search('\d+', node_subnode_workload[2*i].get_text()).group(0)]
            # Subnode tables should come in pairs: one containing items and yields, the other containing distances and cities
            # Also pad with None until each list is the correct length
            if (len(node_subnode_tables) % 2) == 0:
                node_subnode_item = node_subnode_tables[2*i]["Item"]
                node_subnode_item_list = []
                for item in node_subnode_item:
                    node_subnode_item_list.append(item)
                while len(node_subnode_item_list) < 4:
                    node_subnode_item_list.append(None)
                node_subnode_yield = node_subnode_tables[2*i]["Avg"]
                node_subnode_yield_list = []
                for yields in node_subnode_yield:
                    node_subnode_yield_list.append(yields)
                while len(node_subnode_yield_list) < 4:
                    node_subnode_yield_list.append(None)
                node_distance = node_subnode_tables[(2*i)+1]["Range"]
                node_distance_list = []
                for distance in node_distance:
                    node_distance_list.append(distance)
                while len(node_distance_list) < 3:
                    node_distance_list.append(None)
                node_city = node_subnode_tables[(2*i)+1]["City"]
                node_city_list = []
                for city in node_city:
                    node_city_list.append(city)
                while len(node_city_list) < 3:
                    node_city_list.append(None)
                # Combine info into one variable
                subnode_list = [node_subnode_name,
                                node_subnode_cp,
                                base_workload,
                                current_workload,
                                node_subnode_item_list,
                                node_subnode_yield_list,
                                node_distance_list,
                                node_city_list]
                subnode_list = list(chain.from_iterable(subnode_list))
                # Insert into MySQL table 'subnode'
                try:
                    x.execute(
                        """
                        INSERT INTO subnode (name, cp, base_workload, current_workload,
                        item1, item2, item3, item4, yield1, yield2, yield3, yield4,
                        distance1, distance2, distance3, city1, city2, city3)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        """, subnode_list)
                    conn.commit()
                except:
                    conn.rollback()
    # Back it up to the node list page
    driver.execute_script("window.history.go(-1)")
# Post-processing cleaning
x.execute(
    """
    UPDATE node
    SET connections = 'North Kaia Ferry, Catfishman Camp, Calpheon Castle'
    WHERE node_id = 69;
    """
)
conn.commit()
pd.options.display.max_rows = 6
# Table 'node' that we obtained from web scraping bdodae.com
node = pd.read_sql('SELECT * FROM node', con = conn)
node
| | NODE_ID | NAME | CP | AREA | TYPE | REGION_MOD_PERCENT | CONNECTIONS |
|---|---|---|---|---|---|---|---|
| 0 | 1 | Abandoned Iron Mine | 2 | Mediah | Worker Node | 0.0 | Abandoned Iron Mine Saunil District, Abandoned... |
| 1 | 2 | Abandoned Iron Mine Entrance | 1 | Mediah | Connection Node | NaN | Highland Junction, Alumn Rock Valley |
| 2 | 3 | Abandoned Iron Mine Rhutum District | 1 | Mediah | Connection Node | NaN | Abandoned Iron Mine, Abun |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 356 | 357 | Yalt Canyon | 1 | Valencia | Connection Node | NaN | Shakatu, Gahaz Bandit's Lair |
| 357 | 358 | Yianaros's Field | 1 | Kamasylvia | Connection Node | NaN | Western Valtarra Mountains |
| 358 | 359 | Zagam Island | 1 | Margoria | Connection Node | NaN | Nada Island |
359 rows × 7 columns
# Table 'subnode' that we obtained from web scraping bdodae.com
subnode = pd.read_sql('SELECT * FROM subnode', con = conn)
subnode
| | SUBNODE_ID | NAME | CP | BASE_WORKLOAD | CURRENT_WORKLOAD | ITEM1 | ITEM2 | ITEM3 | ITEM4 | YIELD1 | YIELD2 | YIELD3 | YIELD4 | DISTANCE1 | DISTANCE2 | DISTANCE3 | CITY1 | CITY2 | CITY3 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Abandoned Iron Mine - Mining.1 | 3 | 200 | 250 | Zinc Ore | Powder Of Time | Platinum Ore | None | 7.86 | 1.86 | 0.95 | NaN | 1113 | 2063 | NaN | Altinova | Tarif | None |
| 1 | 2 | Abandoned Iron Mine - Mining.2 | 3 | 500 | 100 | Iron Ore | Powder Of Darkness | Rough Black Crystal | None | 10.53 | 1.80 | 1.08 | NaN | 1113 | 2063 | NaN | Altinova | Tarif | None |
| 2 | 3 | Ahto Farm - Farming.1 | 2 | 200 | 100 | Cotton | Cotton Yarn | None | None | 12.53 | 0.85 | NaN | NaN | 702 | 2000 | 2325.0 | Tarif | Heidel | Altinova |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 169 | 170 | Weenie Cabin - Gathering.1 | 2 | 140 | 140 | Volcanic Umbrella Mushroom | Green Pendulous Mushroom | None | None | 7.70 | 6.90 | NaN | NaN | 1608 | 3025 | 6005.0 | Grana | Old Wisdom Tree | Trent |
| 170 | 171 | Weita Island - Fish Drying Yard.1 | 1 | 2400 | 1200 | Dried Surfperch | Dried Bluefish | Dried Maomao | Dried Nibbler | 3.60 | 1.80 | 0.90 | 0.08 | 1620 | 3291 | 3340.0 | Iliya Island | Velia | Olvia |
| 171 | 172 | Wolf Hills - Lumbering.1 | 1 | 150 | 100 | Ash Timber | None | None | None | 4.43 | NaN | NaN | NaN | 340 | 1902 | NaN | Olvia | Velia | None |
172 rows × 19 columns