A simple script using Selenium to download all of your Udemy certificates, since Udemy does not provide an API for this.

There is room for improvement, mostly by removing the time.sleep calls. Those calls are there because the initial page load does not contain the course data: it is queried from an internal API and filled in with JavaScript.


from selenium import webdriver
import time

import os
from urllib.parse import urlparse

import requests

# Udemy account credentials. They can be supplied through the UDEMY_USERNAME
# and UDEMY_PASSWORD environment variables so that real secrets never have to
# be written into this file; the placeholders below are used when the
# variables are not set.
USERNAME = os.environ.get("UDEMY_USERNAME", "xxxx@gmail.com")
PASSWORD = os.environ.get("UDEMY_PASSWORD", "xxxxxx")

# Login form: the address of the page, followed by the XPath locators of the
# e-mail field, the password field and the submit button.
LOGIN_WEBPAGE = "https://www.udemy.com/join/login-popup/?skip_suggest=1"
USERNAME_XPATH = '//*[@id="email--1"]'
PASSWORD_XPATH = '//*[@id="id_password"]'
LOGIN_BUTTON = '//*[@id="submit-id-submit"]'

def do_login(driver):
    """Log into Udemy by filling in and submitting the login form.

    Opens LOGIN_WEBPAGE, types USERNAME and PASSWORD into the fields located
    by USERNAME_XPATH and PASSWORD_XPATH, then clicks the element located by
    LOGIN_BUTTON.

    Args:
        driver: Selenium WebDriver instance used to drive the browser.
    """
    driver.get(LOGIN_WEBPAGE)
    # find_element(by, value) replaces find_element_by_xpath, which was
    # removed in Selenium 4.3; "xpath" is the value of By.XPATH.
    driver.find_element("xpath", USERNAME_XPATH).send_keys(USERNAME)
    driver.find_element("xpath", PASSWORD_XPATH).send_keys(PASSWORD)
    driver.find_element("xpath", LOGIN_BUTTON).click()
    

def get_number_pages(driver, webpage, wait_seconds=10):
    """Return the highest page number linked from a paginated listing.

    Loads the first page of *webpage* and scans every link on it for hrefs
    containing ``<webpage>?p=<number>``.

    Args:
        driver: Selenium WebDriver instance used to load the page.
        webpage: Base URL of the listing, without the ``?p=`` query string.
        wait_seconds: Seconds to pause after loading the page, because some
            JavaScript in Udemy makes the links appear late. Defaults to 10.

    Returns:
        The largest page number found, or 1 when no pagination link exists.
    """
    driver.get(webpage + '?p=1')
    if wait_seconds > 0:
        time.sleep(wait_seconds)

    marker = webpage + '?p='
    highest = 1
    # find_elements(by, value) replaces find_elements_by_xpath, which was
    # removed in Selenium 4.3; "xpath" is the value of By.XPATH.
    for link in driver.find_elements("xpath", "//a[@href]"):
        href = link.get_attribute("href") or ""
        parts = href.split(marker)
        if len(parts) < 2:
            continue
        # Keep only the page number: drop any further query parameters or
        # fragment that follow it in the href.
        candidate = parts[1].split('&', 1)[0].split('#', 1)[0]
        try:
            page = int(candidate)
        except ValueError:
            # Not a numeric page reference; ignore this link.
            continue
        highest = max(highest, page)
    return highest

# Base address of the "my courses" listing and the category sub-paths that
# are appended to it when collecting course IDs.
COURSES_WEBPAGE = 'https://www.udemy.com/home/my-courses/'
CATEGORIES_COURSES = [
    'learning/',
    'archived/',
]

def get_courses_id_by_pagination(driver, webpage, wait_seconds=10):
    """Return the course IDs linked from a single listing page.

    Loads *webpage* and collects, from every link whose href contains
    ``?course_id=``, the value that follows that marker.

    Args:
        driver: Selenium WebDriver instance used to load the page.
        webpage: Full URL of the listing page, including any ``?p=`` part.
        wait_seconds: Seconds to pause after loading the page, because some
            JavaScript in Udemy makes the course links appear late.
            Defaults to 10.

    Returns:
        A set of course ID strings; empty when no matching link is found.
    """
    driver.get(webpage)
    if wait_seconds > 0:
        time.sleep(wait_seconds)

    course_ids = set()
    # find_elements(by, value) replaces find_elements_by_xpath, which was
    # removed in Selenium 4.3; "xpath" is the value of By.XPATH.
    for link in driver.find_elements("xpath", "//a[@href]"):
        href = link.get_attribute("href") or ""
        parts = href.split('?course_id=')
        if len(parts) < 2:
            continue
        # Keep only the ID itself so the same course is not collected twice
        # under different trailing query parameters or fragments.
        course_id = parts[1].split('&', 1)[0].split('#', 1)[0]
        if course_id:
            course_ids.add(course_id)
    return course_ids

def get_courses_id_by_category(driver, webpage):
    """Gather the course IDs from every page of one category listing.

    Asks get_number_pages how many pages the listing at *webpage* has, then
    visits ``<webpage>?p=1`` through ``<webpage>?p=<last>`` with
    get_courses_id_by_pagination and merges the results.

    Args:
        driver: Selenium WebDriver instance used to load the pages.
        webpage: Base URL of the category listing, without ``?p=``.

    Returns:
        A set with the course IDs found across all pages.
    """
    last_page = get_number_pages(driver, webpage)
    collected = set()
    for page in range(1, last_page + 1):
        page_url = webpage + '?p={}'.format(page)
        collected.update(get_courses_id_by_pagination(driver, page_url))
    return collected

def get_courses_id(driver):
    """Gather the course IDs of every category in CATEGORIES_COURSES.

    Each category suffix is appended to COURSES_WEBPAGE and handed to
    get_courses_id_by_category; the per-category sets are merged.

    Args:
        driver: Selenium WebDriver instance used to load the pages.

    Returns:
        A set with the course IDs found across all categories.
    """
    all_ids = set()
    for suffix in CATEGORIES_COURSES:
        category_url = COURSES_WEBPAGE + suffix
        all_ids |= get_courses_id_by_category(driver, category_url)
    return all_ids


# Address that opens a course given its ID (the ID is appended to it).
COURSE_URL_REDIRECT = "https://www.udemy.com/course-dashboard-redirect/?course_id="
# Prefix of the certificate links looked for on a course page.
CERTIFICATE_URL_PREFIX = "https://www.udemy.com/certificate/"
# Template of the PDF address; "{}" is replaced by the certificate ID.
CERTIFICATE_PDF_SCHEMA = "https://udemy-certificate.s3.amazonaws.com/pdf/{}.pdf"

def get_certificate_url(driver, course_id, wait_seconds=5):
    """Return the PDF address of the certificate of a course, if any.

    Opens ``COURSE_URL_REDIRECT + course_id`` and looks for the first link
    whose href contains CERTIFICATE_URL_PREFIX. The path segment that follows
    the prefix is taken as the certificate ID and inserted into
    CERTIFICATE_PDF_SCHEMA.

    Args:
        driver: Selenium WebDriver instance used to load the page.
        course_id: ID of the course to inspect.
        wait_seconds: Seconds to pause after loading the page so its links
            are present before they are scanned. Defaults to 5.

    Returns:
        The certificate PDF URL as a string, or None when the page has no
        usable certificate link.
    """
    driver.get(COURSE_URL_REDIRECT + '{}'.format(course_id))
    if wait_seconds > 0:
        time.sleep(wait_seconds)

    # find_elements(by, value) replaces find_elements_by_xpath, which was
    # removed in Selenium 4.3; "xpath" is the value of By.XPATH.
    for link in driver.find_elements("xpath", "//a[@href]"):
        href = link.get_attribute("href") or ""
        if CERTIFICATE_URL_PREFIX not in href:
            continue
        # The certificate ID is the first path segment after the prefix.
        certificate_id = href.split(CERTIFICATE_URL_PREFIX)[1].split('/')[0]
        if certificate_id:
            return CERTIFICATE_PDF_SCHEMA.format(certificate_id)
    return None

# Folder the certificate PDFs are written to; the trailing slash is needed
# because the file name is appended to this string directly.
DESTINATION = "D:/Vault/Diplomas/Udemy/"

def download(url):
    """Download *url* and store it inside the DESTINATION folder.

    The file is saved under the last path component of the URL.

    Args:
        url: Address of the file to download.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    filename = os.path.basename(urlparse(url).path)
    response = requests.get(url, allow_redirects=True)
    # Fail loudly instead of saving an HTTP error page as if it were the PDF.
    response.raise_for_status()
    # The context manager guarantees the file is flushed and closed.
    with open(os.path.join(DESTINATION, filename), 'wb') as target:
        target.write(response.content)
    

# Run Firefox without a visible window, using the binary at the given path.
opts = webdriver.FirefoxOptions()
opts.binary_location = r"C:/Program Files/Mozilla Firefox/firefox.exe"
opts.add_argument("--headless")

driver = webdriver.Firefox(options=opts)

try:
    # Log in, collect every course ID, then download each certificate found.
    do_login(driver)
    ids = get_courses_id(driver)
    for course_id in ids:
        url = get_certificate_url(driver, course_id)
        if url is not None:
            download(url)
finally:
    # quit() ends the whole WebDriver session (close() only closes the
    # current window), and the finally clause makes sure the browser is shut
    # down even when one of the steps above raises.
    driver.quit()