Schema

SQL data

Web Scraping

I scraped the website bdodae.com for the data this project needs. Since I am optimizing profits, I need both item values and information about the nodes.

Tools of choice

Selenium (driving Firefox) handles the page interaction, BeautifulSoup with lxml parses the page source, and MySQLdb writes the results into a MySQL database.

Scraping item values

Item page

# Import packages
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import time

# Server Connection to MySQL:
import MySQLdb
conn = MySQLdb.connect(host="localhost",
                       user="yourusername",
                       passwd="yourpassword",
                       db="bdodae_new")

x = conn.cursor()

# Create table for items
try:
    x.execute(
    """
    CREATE TABLE IF NOT EXISTS item (
    item_id INT NOT NULL PRIMARY KEY,
    name CHAR(50) NOT NULL)
    """)
    conn.commit()
except MySQLdb.Error:
    conn.rollback()

# Create table for prices
try:
    x.execute(
    """
    CREATE TABLE IF NOT EXISTS prices (
    item_id INT NOT NULL,
    FOREIGN KEY (item_id) REFERENCES item(item_id),
    user_average INT,
    recent_value INT,
    vendor_sell INT,
    vendor_buy INT DEFAULT NULL)
    """)
    conn.commit()
except MySQLdb.Error:
    conn.rollback()
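
# Optional sanity check: confirm both tables exist before we start inserting
x.execute("SHOW TABLES")
print(x.fetchall())  # should list ('item',) and ('prices',)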


# Scraping item values
url = "https://www.bdodae.com/nodes/index.php?page=items"

# Create a new Firefox session and go to the URL above
driver = webdriver.Firefox()
driver.implicitly_wait(30)
driver.get(url)

# Not mandatory
#driver.maximize_window()
# Initial page scrape
soup_values = BeautifulSoup(driver.page_source, 'lxml')

# Getting item names
items_soup = soup_values.find_all('a', class_ = "item_popup")

# Initialize empty list we are going to append item names to
items = []
for item in items_soup:
    item_name = item.get_text()
    items.append(item_name)
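
# Optional sanity check: the page listed 213 items at scrape time
print(len(items))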

# Define a function to clean the text/values
def clean_values(values):
    for i in range(len(values)):
        values[i] = re.search(r"(\d+)", values[i].get_text()).group(0)
    if (len(values) == 3):
        # Add NULL to vendor_buy if it does not have one
        values.insert(2, None)
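
# Hypothetical illustration (not part of the scrape): a row with no vendor buy
# price yields only three value_option divs, and clean_values pads the missing
# vendor_buy slot with None
demo = BeautifulSoup('<div class="value_option">387</div>'
                     '<div class="value_option">662</div>'
                     '<div class="value_option">70</div>', 'lxml'
                     ).find_all('div', class_ = "value_option")
clean_values(demo)
print(demo)  # ['387', '662', None, '70'] = user_average, recent_value, vendor_buy, vendor_sell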
    
# Find the value buttons to click on
value_buttons = driver.find_elements_by_class_name('value_button')
wait = WebDriverWait(driver, 10)
# Big loop that:
# 1.) Clicks on the values boxes to open up the hidden table
# 2.) Scrapes the page, including the now revealed hidden table
# 3.) Finds the values in the hidden table and cleans it
# 4.) Writes the item name and corresponding values into a MySQL database
# 5.) Waits until the hidden table is actually visible before we click off of it to close it
# 6.) Repeats steps 1-5 for all items/rows on the page
for i, (item, button) in enumerate(zip(items, value_buttons)):
    # Click on value box to open hidden table
    button.click()
    # If we click too fast, Selenium throws an error because the opened table covers the other value buttons
    time.sleep(1)
    # Now we scrape the hidden table
    soup_values = BeautifulSoup(driver.page_source, 'lxml')
    values = soup_values.find_all('div', class_ = "value_extra value_extra_on")[-1].find_all('div', class_ = "value_option")
    # Clean the values output
    clean_values(values)
    values_list = [item] + values
    # Safety check for when our code stops or does not find the values
    if (len(values_list) != 5): 
        print("ERROR missing values") 
        print(values_list)
        break
    # Input data into database
    try:
        x.execute(
        """
        INSERT INTO item (item_id, name)
        VALUES (%s, %s)
        """, [i] + [values_list[0]])
        x.execute(
        """
        INSERT INTO prices (item_id, user_average, recent_value, vendor_buy, vendor_sell)
        VALUES (%s, %s, %s, %s, %s)
        """, [i] + values_list[1:])
        conn.commit()
    except MySQLdb.Error:
        conn.rollback()
    # Wait until hidden table pops up
    # If one of the values in the table is a duplicate, then the CSS selector has a 3 instead of 2 at the end
    # Weird, but it's just how it is
    #if (len(set(values_list)) == 3 or len(set(values_list)) == 4):
    #    element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "tr.search:nth-child(" + str(i+2) + ") > td:nth-child(2) > div:nth-child(1) > div:nth-child(3)")))
    #else:
    #    element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "tr.search:nth-child(" + str(i+2) + ") > td:nth-child(2) > div:nth-child(1) > div:nth-child(2)")))
    # Click off the values box to close the hidden table
    off_button = driver.find_element_by_css_selector("tr.search:nth-child(" + str(i+2) + ") > td:nth-child(3)")
    off_button.click()
    # As of 10/24/2018 reset_actions() does NOT clear locally in Firefox
    # So there will be lots of extra random clicking if you run this in Firefox
    #action.move_to_element_with_offset(button, -5, 0).click().perform()
    #action.reset_actions()
    time.sleep(1)
pd.options.display.max_rows = 6
# Table 'item' that we obtained from web scraping bdodae.com
item = pd.read_sql('SELECT * FROM item', con = conn)
item
     item_id                     name
0          0               Acacia Sap
1          1            Acacia Timber
2          2                     Aloe
..       ...                      ...
210      210  White Umbrella Mushroom
211      211            Withered Leaf
212      212                 Zinc Ore

213 rows × 2 columns

pd.options.display.max_rows = 6
# Table 'prices' that we obtained from web scraping bdodae.com
prices = pd.read_sql('SELECT * FROM prices', con = conn)
prices
     ITEM_ID                     ITEM  USER_AVERAGE  RECENT_VALUE  VENDOR_SELL  VENDOR_BUY
0          1               Acacia Sap           387           662           70         NaN
1          2            Acacia Timber           874           907          105         NaN
2          3                     Aloe           212           174           40         NaN
..       ...                      ...           ...           ...          ...         ...
198      199  White Umbrella Mushroom           273           531           46         NaN
199      200            Withered Leaf           110           110          110         NaN
200      201                 Zinc Ore          1016          1058           90         NaN

201 rows × 6 columns
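
Since prices.item_id is a foreign key into item, a quick join (a sketch run against the same connection) confirms the two tables line up:

# Optional: join prices back to item on the item_id foreign key
pd.read_sql("""
    SELECT item.name, prices.user_average, prices.vendor_sell
    FROM prices
    JOIN item ON prices.item_id = item.item_id
    LIMIT 3
""", con = conn)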

Scraping node data

Node page

# Import packages
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
from itertools import chain
import re
import pandas as pd
import os
import time

# Launch URL
url = "https://www.bdodae.com/"

# Create a new Firefox session
driver = webdriver.Firefox()
driver.implicitly_wait(30)
driver.get(url)

python_button = driver.find_element_by_id("nodes_nav")
python_button.click() # Click node link
# Storing page source in a variable
soup_level1 = BeautifulSoup(driver.page_source, 'lxml')

# Grab all page links to nodes
links = soup_level1.find_all('a', {'href' : re.compile(r"^index\.php\?node"), 'class' : False})
# Server Connection to MySQL:
import MySQLdb
conn = MySQLdb.connect(host="localhost",
                       user="yourusername",
                       passwd="yourpassword",
                       db="bdodae_new")

x = conn.cursor()

# Create table for nodes
try:
    x.execute(
    """
    CREATE TABLE IF NOT EXISTS node (
    node_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
    name CHAR(50) NOT NULL,
    cp INT DEFAULT 0,
    area CHAR(50),
    type CHAR(50),
    region_mod_percent FLOAT DEFAULT NULL,
    connections CHAR(255))
    """)
    conn.commit()
except MySQLdb.Error:
    conn.rollback()

# Create table for subnodes
try:
    x.execute(
    """
    CREATE TABLE IF NOT EXISTS subnode (
    subnode_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
    name CHAR(50) NOT NULL,
    cp INT DEFAULT 0,
    base_workload INT DEFAULT NULL,
    current_workload INT DEFAULT NULL,
    node_id INT NOT NULL,
    FOREIGN KEY (node_id) REFERENCES node(node_id))
    """)
    conn.commit()
except MySQLdb.Error:
    conn.rollback()

# Create tables for cities
try:
    x.execute(
    """
    CREATE TABLE IF NOT EXISTS city (
    city_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
    name CHAR(50) NOT NULL)
    """)
    conn.commit()
except MySQLdb.Error:
    conn.rollback()

# Create tables for yields (items) and distances (cities)
try:
    x.execute(
    """
    CREATE TABLE IF NOT EXISTS yield (
    yield_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
    amount INT DEFAULT NULL,
    item_id INT NOT NULL,
    FOREIGN KEY (item_id) REFERENCES item(item_id))
    """)
    x.execute(
    """
    CREATE TABLE IF NOT EXISTS distance (
    distance_id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
    amount INT DEFAULT NULL,
    city_id INT NOT NULL,
    FOREIGN KEY (city_id) REFERENCES city(city_id))
    """)
    conn.commit()
except MySQLdb.Error:
    conn.rollback()

# Create junction tables for subnode/city and subnode/item
try:
    x.execute(
    """
    CREATE TABLE IF NOT EXISTS subnode_item (
    subnode_id INT NOT NULL,
    FOREIGN KEY (subnode_id) REFERENCES subnode(subnode_id),
    item_id INT NOT NULL,
    FOREIGN KEY (item_id) REFERENCES item(item_id))
    """)
    x.execute(
    """
    CREATE TABLE IF NOT EXISTS subnode_city (
    subnode_id INT NOT NULL,
    FOREIGN KEY (subnode_id) REFERENCES subnode(subnode_id),
    city_id INT NOT NULL,
    FOREIGN KEY (city_id) REFERENCES city(city_id))
    """)
    conn.commit()
except MySQLdb.Error:
    conn.rollback()
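
Once the master loop below populates these tables, the junction tables make questions like "which items does each subnode yield?" a single join. A sketch:

# Optional: list a few subnode/item pairs via the subnode_item junction table
pd.read_sql("""
    SELECT subnode.name AS subnode, item.name AS item
    FROM subnode_item
    JOIN subnode ON subnode_item.subnode_id = subnode.subnode_id
    JOIN item ON subnode_item.item_id = item.item_id
    LIMIT 5
""", con = conn)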

Master

# Go through every node page
for link in links:
    # Get node link
    link = link.get_text()
    
    # Click on link
    python_button = driver.find_element_by_link_text(link)
    python_button.click()
    
    # soup_level 2 contains the page source for the node clicked on
    soup_level2 = BeautifulSoup(driver.page_source, 'lxml')
    
    # General node info
    node_title = soup_level2.find(class_ = 'n_title_link').get_text()
    node_name = re.split(r'\s(?=\dCP)', node_title)[0]
    node_cp = re.split(r'\s(?=\dCP)', node_title)[1][0]
    node_area = soup_level2.find(class_ = 'n_area').get_text()
    node_type = soup_level2.find(class_ = 'n_type').get_text()
    node_connections = re.split('Connected: ', soup_level2.find(class_ = 'n_connected').get_text())[1]
    # Check if region modifier exists
    if (soup_level2.find(class_ = 'n_region')):
        node_region_mod = re.search(r'\d', soup_level2.find(class_ = 'n_region').get_text()).group(0)
    else:
        node_region_mod = None
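    # For example, re.split(r'\s(?=\dCP)', 'Loggia Farm 1CP') gives
    # ['Loggia Farm', '1CP'], so [0] is the node name and [1][0] is the
    # single-digit CP cost (the title here is just an illustrative example)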

    # Combine info into one variable
    node_list = [node_name,
                 node_cp,
                 node_area,
                 node_type,
                 node_region_mod,
                 node_connections]

    # Insert into MySQL database 'node'
    try:
        x.execute(
        """
        INSERT INTO node (name, cp, area, type, region_mod_percent, connections)
        VALUES (%s, %s, %s, %s, %s, %s)
        """, node_list)
        conn.commit()
    except MySQLdb.Error:
        conn.rollback()
    
    # node_id is the primary key for the node table and foreign key to the subnode table
    node_id = pd.read_sql("SELECT LAST_INSERT_ID();", con = conn).iloc[0, 0]
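    # LAST_INSERT_ID() is tracked per connection in MySQL, so it returns the
    # AUTO_INCREMENT id generated by the INSERT above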

    # Get subnode information if it exists
    # Otherwise this part does nothing
    node_subnode_subtype = soup_level2.find_all(class_ = 's_subtype')
    node_subnode_workload = soup_level2.find_all(class_ = 's_workload')
    node_subnode_tables = soup_level2.find_all('table', class_ = re.compile("^(?!worker_select_)"))
    # Check if there are subnode tables
    if node_subnode_tables:
        node_subnode_tables = pd.read_html(str(node_subnode_tables[1:]), header = 0)

    # Default the workload fields to NULL in case a page lists no workloads
    base_workload = None
    current_workload = None
    for i, text in enumerate(node_subnode_subtype):
        # Pass scalars, not one-element lists, so MySQLdb escapes them correctly
        node_subnode_name = node_name + " - " + re.split(r'\s(?=\dCP)', text.get_text())[0] + "." + str(i+1)
        node_subnode_cp = re.split(r'\s(?=\dCP)', text.get_text())[1][0]
        # Workload information should come in pairs: base and current workload
        if (len(node_subnode_workload) % 2) == 0:
            base_workload = re.search(r'\d+', node_subnode_workload[2*i].get_text()).group(0)
            current_workload = re.search(r'\d+', node_subnode_workload[(2*i)+1].get_text()).group(0)
        try:
            x.execute("""
            INSERT INTO subnode (name, cp, base_workload, current_workload, node_id)
            VALUES (%s, %s, %s, %s, %s)
            """, [node_subnode_name,
                  node_subnode_cp,
                  base_workload,
                  current_workload,
                  node_id])
            conn.commit()
        except MySQLdb.Error:
            conn.rollback()
        
        # subnode_id is the primary key of subnode and foreign keys to subnode_city and subnode_item
        subnode_id = pd.read_sql("SELECT LAST_INSERT_ID();", con = conn).iloc[0, 0]
        
        # Subnode tables should come in pairs: one containing items and yields, other containing distances and cities
        if (len(node_subnode_tables) % 2) == 0:
            # Get all items for a subnode
            node_subnode_item = node_subnode_tables[2*i]["Item"]
            # Get all item yields for a subnode
            node_subnode_yield = node_subnode_tables[2*i]["Avg"]
            for item, yield_ in zip(node_subnode_item, node_subnode_yield):
                # item_id is a primary key for item and foreign key to subnode_item
                # yield_id is primary key to yield; item_id is foreign key to item.item_id primary key
                item_id = pd.read_sql("SELECT item_id FROM item WHERE item.name = %(item)s;", con = conn, params = {"item": item}).iloc[0, 0]
                try:
                    x.execute(
                    """
                    INSERT INTO subnode_item (subnode_id, item_id)
                    VALUES (%s, %s)
                    """, [subnode_id, item_id])
                    x.execute(
                    """
                    INSERT INTO yield (amount, item_id)
                    VALUES (%s, %s)
                    """, [yield_, item_id])
                    conn.commit()
                except MySQLdb.Error:
                    conn.rollback()
            
            # Get all cities near subnode
            node_city = node_subnode_tables[(2*i)+1]["City"]
            # Get all distances of close cities
            node_distance = node_subnode_tables[(2*i)+1]["Range"]
            for city, distance in zip(node_city, node_distance):
                # If a city is not in the city table, we add it to the table
                if (pd.read_sql("SELECT city_id FROM city WHERE city.name = %(city)s;", con = conn, params = {"city": city}).empty):
                    try:
                        x.execute("""
                        INSERT INTO city (name)
                        VALUES (%s)
                        """, [city])
                        conn.commit()
                    except MySQLdb.Error:
                        conn.rollback()
                city_id = pd.read_sql("SELECT city_id FROM city WHERE city.name = %(city)s", con = conn, params = {"city": city}).iloc[0, 0]
                try:
                    x.execute("""
                    INSERT INTO subnode_city (subnode_id, city_id)
                    VALUES (%s, %s)
                    """, [subnode_id, city_id])
                    x.execute("""
                    INSERT INTO distance (amount, city_id)
                    VALUES (%s, %s)
                    """, [distance, city_id])
                    conn.commit()
                except MySQLdb.Error:
                    conn.rollback()
                
    # Navigate back to the node list page
    driver.execute_script("window.history.go(-1)")

# Server Connection to MySQL:
import MySQLdb
conn = MySQLdb.connect(host="localhost",
                       user="yourusername",
                       passwd="yourpassword",
                       db="bdodae_new")

x = conn.cursor()

# Post-processing cleaning
x.execute(
"""
UPDATE node
    SET connections = 'North Kaia Ferry, Catfishman Camp, Calpheon Castle'
    WHERE name = "Calpheon Castle Site";
"""
)
conn.commit()
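
A quick read-back confirms the fix:

# Optional: verify the corrected connections string
pd.read_sql("SELECT name, connections FROM node WHERE name = 'Calpheon Castle Site'", con = conn)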