Web Scraping
I scraped the website bdodae.com for the data this project needs. Since I am optimizing node profits, I need both item values and information about the nodes themselves.
Tools of choice
- Python because I want to learn another language for data analysis (having used R throughout college)
- Selenium because bdodae.com renders its content with JavaScript rather than serving static HTML. The site is dynamic, so each page has to be loaded in a real browser before it can be scraped (see the short sketch after this list).
- BeautifulSoup to parse the webpage once it is loaded in Selenium.
- MySQLdb because I want to save the scraped information to a MySQL database that I can pull information from later.
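Before the full scripts below, here is a minimal sketch of how these pieces fit together: Selenium renders the JavaScript page, BeautifulSoup parses the rendered source, and MySQLdb stores whatever gets extracted. The URL and the item_popup selector come from the item script further down; the demo table and the connection credentials are placeholders for illustration only.
# Minimal load -> parse -> store sketch (placeholder table and credentials)
from selenium import webdriver
from bs4 import BeautifulSoup
import MySQLdb

driver = webdriver.Firefox()                       # Selenium renders the JavaScript page
driver.implicitly_wait(30)
driver.get("https://www.bdodae.com/nodes/index.php?page=items")
soup = BeautifulSoup(driver.page_source, 'lxml')   # BeautifulSoup parses the rendered source
names = [a.get_text() for a in soup.find_all('a', class_='item_popup')]

conn = MySQLdb.connect(host="localhost", user="yourusername",
                       passwd="yourpassword", db="bdodae")
cur = conn.cursor()
cur.execute("CREATE TABLE IF NOT EXISTS demo (name CHAR(50))")   # placeholder table
cur.executemany("INSERT INTO demo (name) VALUES (%s)", [(n,) for n in names])
conn.commit()
driver.quit()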
Scraping item values
# Import packages
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import time
# Server Connection to MySQL:
import MySQLdb
conn = MySQLdb.connect(host="localhost",
                       user="yourusername",
                       passwd="yourpassword",
                       db="bdodae")
x = conn.cursor()
# Create table for prices
try:
    x.execute(
        """
        CREATE TABLE IF NOT EXISTS PRICES (
            ITEM_ID INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            ITEM CHAR(50) NOT NULL,
            USER_AVERAGE INT,
            RECENT_VALUE INT,
            VENDOR_SELL INT,
            VENDOR_BUY INT DEFAULT NULL)
        """)
    conn.commit()
except:
    conn.rollback()
# Scraping item values
url = "https://www.bdodae.com/nodes/index.php?page=items"
# Create a new Firefox session and go to the URL above
driver = webdriver.Firefox()
driver.implicitly_wait(30)
driver.get(url)
# Not mandatory
#driver.maximize_window()
# Initial page scrape
soup_values = BeautifulSoup(driver.page_source, 'lxml')
# Getting item names
items_soup = soup_values.find_all('a', class_ = "item_popup")
# Initialize empty list we are going to append item names to
items = []
for item in items_soup:
    item_name = item.get_text()
    items.append(item_name)
# Define a function to clean the text/values
def clean_values(values):
    for i in range(len(values)):
        values[i] = re.search("(\d+)", values[i].get_text()).group(0)
    if (len(values) == 3):
        # Add NULL to vendor_buy if it does not have one
        values.insert(2, None)
# Find the value buttons to click on
value_buttons = driver.find_elements_by_class_name('value_button')
wait = WebDriverWait(driver, 10)
# Big loop that:
# 1.) Clicks on the values boxes to open up the hidden table
# 2.) Scrapes the page, including the now revealed hidden table
# 3.) Finds the values in the hidden table and cleans it
# 4.) Writes the item name and corresponding values into a MySQL database
# 5.) Waits until the hidden table is actually visible before we click off of it to close it
# 6.) Repeats steps 1-5 for all items/rows on the page
for item, button, i in zip(items, value_buttons, range(len(items))):
    # Click on value box to open hidden table
    button.click()
    # If it clicks too fast, it'll throw an error since the table will cover other values
    time.sleep(1)
    # Now we scrape the hidden table
    soup_values = BeautifulSoup(driver.page_source, 'lxml')
    values = soup_values.find_all('div', class_ = "value_extra value_extra_on")[-1].find_all('div', class_ = "value_option")
    # Clean the values output
    clean_values(values)
    values_list = [item] + values
    # Safety check for when our code stops or does not find the values
    if (len(values_list) != 5):
        print("ERROR missing values")
        print(values_list)
        break
    # Input data into database
    try:
        x.execute(
            """
            INSERT INTO prices (item, user_average, recent_value, vendor_buy, vendor_sell)
            VALUES (%s, %s, %s, %s, %s)
            """, values_list)
        conn.commit()
    except:
        conn.rollback()
    # Wait until hidden table pops up
    # If one of the values in the table is a duplicate, then the CSS selector has a 3 instead of a 2 at the end
    # Weird, but it's just how it is
    if (len(set(values_list)) == 3 or len(set(values_list)) == 4):
        element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "tr.search:nth-child(" + str(i+2) + ") > td:nth-child(2) > div:nth-child(1) > div:nth-child(3)")))
    else:
        element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "tr.search:nth-child(" + str(i+2) + ") > td:nth-child(2) > div:nth-child(1) > div:nth-child(2)")))
    # Click off the values box to close the hidden table
    off_button = driver.find_element_by_css_selector("tr.search:nth-child(" + str(i+2) + ") > td:nth-child(3)")
    off_button.click()
    # As of 10/24/2018 reset_actions() does NOT clear locally in Firefox
    # So there will be lots of extra random clicking if you run this in Firefox
    #action.move_to_element_with_offset(button, -5, 0).click().perform()
    #action.reset_actions()
    time.sleep(1)
pd.options.display.max_rows = 6
# Table 'prices' that we obtained from web scraping bdodae.com
prices = pd.read_sql('SELECT * FROM prices', con = conn)
prices
| | ITEM_ID | ITEM | USER_AVERAGE | RECENT_VALUE | VENDOR_SELL | VENDOR_BUY |
|---|---|---|---|---|---|---|
| 0 | 1 | Acacia Sap | 387 | 662 | 70 | NaN |
| 1 | 2 | Acacia Timber | 874 | 907 | 105 | NaN |
| 2 | 3 | Aloe | 212 | 174 | 40 | NaN |
| ... | ... | ... | ... | ... | ... | ... |
| 198 | 199 | White Umbrella Mushroom | 273 | 531 | 46 | NaN |
| 199 | 200 | Withered Leaf | 110 | 110 | 110 | NaN |
| 200 | 201 | Zinc Ore | 1016 | 1058 | 90 | NaN |
201 rows × 6 columns
Scraping node data
# Import packages
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
from itertools import chain
import re
import pandas as pd
import os
import time
# Launch URL
url = "https://www.bdodae.com/"
# Create a new Firefox session
driver = webdriver.Firefox()
driver.implicitly_wait(30)
driver.get(url)
python_button = driver.find_element_by_id("nodes_nav")
python_button.click() # Click node link
# Storing page source in a variable
soup_level1 = BeautifulSoup(driver.page_source, 'lxml')
# Grab all page links to nodes
links = soup_level1.find_all('a', {'href' : re.compile("^index\.php\?node"), 'class' : False})
# Server Connection to MySQL:
import MySQLdb
conn = MySQLdb.connect(host="localhost",
                       user="yourusername",
                       passwd="yourpassword",
                       db="bdodae")
x = conn.cursor()
# Create table for nodes
try:
    x.execute(
        """
        CREATE TABLE IF NOT EXISTS NODE (
            NODE_ID INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            NAME CHAR(50) NOT NULL,
            CP INT DEFAULT 0,
            AREA CHAR(50),
            TYPE CHAR(50),
            REGION_MOD_PERCENT FLOAT DEFAULT NULL,
            CONNECTIONS CHAR(255))
        """)
    conn.commit()
except:
    conn.rollback()
# Create table for subnodes
try:
    x.execute(
        """
        CREATE TABLE IF NOT EXISTS SUBNODE (
            SUBNODE_ID INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
            NAME CHAR(50) NOT NULL,
            CP INT DEFAULT 0,
            BASE_WORKLOAD INT DEFAULT NULL,
            CURRENT_WORKLOAD INT DEFAULT NULL,
            ITEM1 CHAR(50) DEFAULT NULL,
            ITEM2 CHAR(50) DEFAULT NULL,
            ITEM3 CHAR(50) DEFAULT NULL,
            ITEM4 CHAR(50) DEFAULT NULL,
            YIELD1 FLOAT DEFAULT NULL,
            YIELD2 FLOAT DEFAULT NULL,
            YIELD3 FLOAT DEFAULT NULL,
            YIELD4 FLOAT DEFAULT NULL,
            DISTANCE1 INT DEFAULT 0,
            DISTANCE2 INT DEFAULT NULL,
            DISTANCE3 INT DEFAULT NULL,
            CITY1 CHAR(50),
            CITY2 CHAR(50) DEFAULT NULL,
            CITY3 CHAR(50) DEFAULT NULL)
        """)
    conn.commit()
except:
    conn.rollback()
# Go through every node page
for link in links:
    # Get node link
    link = link.get_text()
    # Click on link
    python_button = driver.find_element_by_link_text(link)
    python_button.click()
    # soup_level2 contains the page source for the node clicked on
    soup_level2 = BeautifulSoup(driver.page_source, 'lxml')
    # General node info
    node_title = soup_level2.find(class_ = 'n_title_link').get_text()
    node_name = re.split('\s(?=\dCP)', node_title)[0]
    node_cp = re.split('\s(?=\dCP)', node_title)[1][0]
    node_area = soup_level2.find(class_ = 'n_area').get_text()
    node_type = soup_level2.find(class_ = 'n_type').get_text()
    node_connections = re.split('Connected: ', soup_level2.find(class_ = 'n_connected').get_text())[1]
    # Check if region modifier exists
    if (soup_level2.find(class_ = 'n_region')):
        node_region_mod = re.search('\d', soup_level2.find(class_ = 'n_region').get_text()).group(0)
    else:
        node_region_mod = None
    # Combine info into one variable
    node_list = [node_name,
                 node_cp,
                 node_area,
                 node_type,
                 node_region_mod,
                 node_connections]
    # Insert into MySQL table 'node'
    try:
        x.execute(
            """
            INSERT INTO node (name, cp, area, type, region_mod_percent, connections)
            VALUES (%s, %s, %s, %s, %s, %s)
            """, node_list)
        conn.commit()
    except:
        conn.rollback()
    # Get subnode information if it exists
    # Otherwise this part does nothing
    node_subnode_subtype = soup_level2.find_all(class_ = 's_subtype')
    node_subnode_workload = soup_level2.find_all(class_ = 's_workload')
    node_subnode_tables = soup_level2.find_all('table', class_ = re.compile("^(?!worker_select_)"))
    # Check if there are subnode tables
    if node_subnode_tables:
        node_subnode_tables = pd.read_html(str(node_subnode_tables[1:]), header = 0)
        node_subnode_name = []
        node_subnode_cp = []
        base_workload = []
        current_workload = []
        for text, i in zip(node_subnode_subtype, range(len(node_subnode_subtype))):
            node_subnode_name = [node_name + " - " + re.split('\s(?=\dCP)', text.get_text())[0] + "." + str(i+1)]
            node_subnode_cp = [re.split('\s(?=\dCP)', text.get_text())[1][0]]
            # Workload information should come in pairs: base and current workload
            if (len(node_subnode_workload) % 2) == 0:
                base_workload = [re.search('\d+', node_subnode_workload[(2*i)-1].get_text()).group(0)]
                current_workload = [re.search('\d+', node_subnode_workload[2*i].get_text()).group(0)]
            # Subnode tables should come in pairs: one containing items and yields, the other containing distances and cities
            # Also pad with None until each list is the correct length
            if (len(node_subnode_tables) % 2) == 0:
                node_subnode_item = node_subnode_tables[2*i]["Item"]
                node_subnode_item_list = []
                for item in node_subnode_item:
                    node_subnode_item_list.append(item)
                while len(node_subnode_item_list) < 4:
                    node_subnode_item_list.append(None)
                node_subnode_yield = node_subnode_tables[2*i]["Avg"]
                node_subnode_yield_list = []
                for yields in node_subnode_yield:
                    node_subnode_yield_list.append(yields)
                while len(node_subnode_yield_list) < 4:
                    node_subnode_yield_list.append(None)
                node_distance = node_subnode_tables[(2*i)+1]["Range"]
                node_distance_list = []
                for distance in node_distance:
                    node_distance_list.append(distance)
                while len(node_distance_list) < 3:
                    node_distance_list.append(None)
                node_city = node_subnode_tables[(2*i)+1]["City"]
                node_city_list = []
                for city in node_city:
                    node_city_list.append(city)
                while len(node_city_list) < 3:
                    node_city_list.append(None)
                # Combine info into one variable
                subnode_list = [node_subnode_name,
                                node_subnode_cp,
                                base_workload,
                                current_workload,
                                node_subnode_item_list,
                                node_subnode_yield_list,
                                node_distance_list,
                                node_city_list]
                subnode_list = list(chain.from_iterable(subnode_list))
                # Insert into MySQL table 'subnode'
                try:
                    x.execute(
                        """
                        INSERT INTO subnode (name, cp, base_workload, current_workload,
                        item1, item2, item3, item4, yield1, yield2, yield3, yield4,
                        distance1, distance2, distance3, city1, city2, city3)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        """, subnode_list)
                    conn.commit()
                except:
                    conn.rollback()
    # Back it up to the node list page
    driver.execute_script("window.history.go(-1)")
# Post-processing cleaning
x.execute(
    """
    UPDATE node
    SET connections = 'North Kaia Ferry, Catfishman Camp, Calpheon Castle'
    WHERE node_id = 69;
    """
)
conn.commit()
pd.options.display.max_rows = 6
# Table 'node' that we obtained from web scraping bdodae.com
node = pd.read_sql('SELECT * FROM node', con = conn)
node
| | NODE_ID | NAME | CP | AREA | TYPE | REGION_MOD_PERCENT | CONNECTIONS |
|---|---|---|---|---|---|---|---|
| 0 | 1 | Abandoned Iron Mine | 2 | Mediah | Worker Node | 0.0 | Abandoned Iron Mine Saunil District, Abandoned... |
| 1 | 2 | Abandoned Iron Mine Entrance | 1 | Mediah | Connection Node | NaN | Highland Junction, Alumn Rock Valley |
| 2 | 3 | Abandoned Iron Mine Rhutum District | 1 | Mediah | Connection Node | NaN | Abandoned Iron Mine, Abun |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 356 | 357 | Yalt Canyon | 1 | Valencia | Connection Node | NaN | Shakatu, Gahaz Bandit's Lair |
| 357 | 358 | Yianaros's Field | 1 | Kamasylvia | Connection Node | NaN | Western Valtarra Mountains |
| 358 | 359 | Zagam Island | 1 | Margoria | Connection Node | NaN | Nada Island |
359 rows × 7 columns
# Table 'subnode' that we obtained from web scraping bdodae.com
subnode = pd.read_sql('SELECT * FROM subnode', con = conn)
subnode
| | SUBNODE_ID | NAME | CP | BASE_WORKLOAD | CURRENT_WORKLOAD | ITEM1 | ITEM2 | ITEM3 | ITEM4 | YIELD1 | YIELD2 | YIELD3 | YIELD4 | DISTANCE1 | DISTANCE2 | DISTANCE3 | CITY1 | CITY2 | CITY3 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Abandoned Iron Mine - Mining.1 | 3 | 200 | 250 | Zinc Ore | Powder Of Time | Platinum Ore | None | 7.86 | 1.86 | 0.95 | NaN | 1113 | 2063 | NaN | Altinova | Tarif | None |
| 1 | 2 | Abandoned Iron Mine - Mining.2 | 3 | 500 | 100 | Iron Ore | Powder Of Darkness | Rough Black Crystal | None | 10.53 | 1.80 | 1.08 | NaN | 1113 | 2063 | NaN | Altinova | Tarif | None |
| 2 | 3 | Ahto Farm - Farming.1 | 2 | 200 | 100 | Cotton | Cotton Yarn | None | None | 12.53 | 0.85 | NaN | NaN | 702 | 2000 | 2325.0 | Tarif | Heidel | Altinova |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 169 | 170 | Weenie Cabin - Gathering.1 | 2 | 140 | 140 | Volcanic Umbrella Mushroom | Green Pendulous Mushroom | None | None | 7.70 | 6.90 | NaN | NaN | 1608 | 3025 | 6005.0 | Grana | Old Wisdom Tree | Trent |
| 170 | 171 | Weita Island - Fish Drying Yard.1 | 1 | 2400 | 1200 | Dried Surfperch | Dried Bluefish | Dried Maomao | Dried Nibbler | 3.60 | 1.80 | 0.90 | 0.08 | 1620 | 3291 | 3340.0 | Iliya Island | Velia | Olvia |
| 171 | 172 | Wolf Hills - Lumbering.1 | 1 | 150 | 100 | Ash Timber | None | None | None | 4.43 | NaN | NaN | NaN | 340 | 1902 | NaN | Olvia | Velia | None |
172 rows × 19 columns