2 minutes
How to download all certificates of UDEMY
A simple script using Selenium to download all of your Udemy certificates, as Udemy does not provide an API for this.
There is room for improvement, mostly in removing the time.sleep calls. Those calls are there because the initial load of the page does not contain that data: it is queried via an internal API and filled in with JavaScript.
import os
import time
from urllib.parse import urlparse

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
# Udemy account credentials typed into the login form (the values here look
# like placeholders; set real ones before running).
USERNAME = "xxxx@gmail.com"
PASSWORD = "xxxxxx"

# Login page address and the XPath locators of the form controls on it.
LOGIN_WEBPAGE = "https://www.udemy.com/join/login-popup/?skip_suggest=1"
USERNAME_XPATH = '//*[@id="email--1"]'
PASSWORD_XPATH = '//*[@id="id_password"]'
LOGIN_BUTTON = '//*[@id="submit-id-submit"]'
def do_login(driver):
    """Log in to Udemy by filling in and submitting the login form.

    Args:
        driver: Selenium WebDriver that drives the browser session.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if one of the
            form controls cannot be found with the configured XPaths.
    """
    driver.get(LOGIN_WEBPAGE)
    # The find_element_by_* helpers were removed in Selenium 4.3;
    # find_element(By.XPATH, ...) is the supported spelling.
    driver.find_element(By.XPATH, USERNAME_XPATH).send_keys(USERNAME)
    driver.find_element(By.XPATH, PASSWORD_XPATH).send_keys(PASSWORD)
    driver.find_element(By.XPATH, LOGIN_BUTTON).click()
def get_number_pages(driver, webpage):
    """Return the highest page number linked from the pagination of *webpage*.

    Loads the first page of the listing and scans every link for the
    ``<webpage>?p=<number>`` pattern.

    Args:
        driver: Selenium WebDriver used to load the page.
        webpage: Listing URL without the ``?p=`` query string.

    Returns:
        The largest page number found in the links, or 1 when there is none.
    """
    driver.get(webpage + '?p=1')
    time.sleep(10)  # Some Javascript in Udemy makes detecting the courses slow
    page_marker = webpage + '?p='
    number = 1
    # The find_elements_by_* helpers were removed in Selenium 4.3.
    for anchor in driver.find_elements(By.XPATH, "//a[@href]"):
        href = anchor.get_attribute("href")
        if not href or page_marker not in href:
            continue
        # Keep only the page value: extra query parameters or a fragment
        # after it would otherwise make int() raise ValueError.
        page_text = href.split(page_marker)[1].split('&')[0].split('#')[0]
        if page_text.isdecimal():
            number = max(number, int(page_text))
    return number
# Base URL of the "my courses" area of the site.
COURSES_WEBPAGE = "https://www.udemy.com/home/my-courses/"
# Sub-paths that get appended to COURSES_WEBPAGE, one per course listing.
CATEGORIES_COURSES = ["learning/", "archived/"]
def get_courses_id_by_pagination(driver, webpage):
    """Collect the course ids linked from a single listing page.

    Args:
        driver: Selenium WebDriver used to load the page.
        webpage: Full URL of the listing page to scan.

    Returns:
        A set with the course id strings found after ``?course_id=`` in the
        links of the page.
    """
    driver.get(webpage)
    time.sleep(10)  # Some Javascript in Udemy makes detecting the courses slow
    marker = '?course_id='
    result = set()
    # The find_elements_by_* helpers were removed in Selenium 4.3.
    for anchor in driver.find_elements(By.XPATH, "//a[@href]"):
        href = anchor.get_attribute("href")
        if not href or marker not in href:
            continue
        # Drop any further query parameters or fragment so the same course
        # is not collected twice under different-looking ids.
        course_id = href.split(marker)[1].split('&')[0].split('#')[0]
        if course_id:
            result.add(course_id)
    return result
def get_courses_id_by_category(driver, webpage):
    """Gather the course ids from every page of one course listing.

    Args:
        driver: Selenium WebDriver passed on to the page-level helpers.
        webpage: Listing URL without the ``?p=`` query string.

    Returns:
        A set with the ids found on pages 1 through the last page reported
        by get_number_pages.
    """
    last_page = get_number_pages(driver, webpage)
    collected = set()
    for page in range(1, last_page + 1):
        page_url = '{}?p={}'.format(webpage, page)
        collected.update(get_courses_id_by_pagination(driver, page_url))
    return collected
def get_courses_id(driver):
    """Gather the course ids of every listing in CATEGORIES_COURSES.

    Args:
        driver: Selenium WebDriver passed on to the category-level helper.

    Returns:
        A set with the ids found across all configured listings.
    """
    all_ids = set()
    for category_path in CATEGORIES_COURSES:
        category_url = COURSES_WEBPAGE + category_path
        all_ids.update(get_courses_id_by_category(driver, category_url))
    return all_ids
# URL prefix that, followed by a course id, opens that course's dashboard.
COURSE_URL_REDIRECT = "https://www.udemy.com/course-dashboard-redirect/?course_id="
# Links starting with this prefix point at a certificate page.
CERTIFICATE_URL_PREFIX = "https://www.udemy.com/certificate/"
# Template of the PDF address; the placeholder receives the certificate id.
CERTIFICATE_PDF_SCHEMA = "https://udemy-certificate.s3.amazonaws.com/pdf/{}.pdf"
def get_certificate_url(driver, course_id):
    """Return the PDF URL of the certificate for *course_id*, if there is one.

    Opens the course dashboard redirect page, waits for it to render and
    takes the first link that contains CERTIFICATE_URL_PREFIX. The
    certificate id found in that link is substituted into
    CERTIFICATE_PDF_SCHEMA.

    Args:
        driver: Selenium WebDriver used to load the page.
        course_id: Course identifier appended to COURSE_URL_REDIRECT.

    Returns:
        The PDF URL as a string, or None when the page shows no usable
        certificate link.
    """
    driver.get(COURSE_URL_REDIRECT + '{}'.format(course_id))
    time.sleep(5)  # wait for the page to finish rendering its links
    certificate_url = None
    # The find_elements_by_* helpers were removed in Selenium 4.3.
    for anchor in driver.find_elements(By.XPATH, "//a[@href]"):
        href = anchor.get_attribute("href")
        if href and CERTIFICATE_URL_PREFIX in href:
            certificate_url = href
            break
    if certificate_url is None:
        return None
    # Split on the shared constant rather than a second copy of the literal.
    tail = certificate_url.split(CERTIFICATE_URL_PREFIX)[1]
    # The id is the first path segment; strip a query string or fragment
    # that may be attached to it when there is no trailing slash.
    certificate_id = tail.split('/')[0].split('?')[0].split('#')[0]
    if not certificate_id:
        return None
    return CERTIFICATE_PDF_SCHEMA.format(certificate_id)
# Folder the downloaded certificates are written to.
DESTINATION = r"D:/Vault/Diplomas/Udemy/"


def download(url, timeout=60):
    """Download *url* and save it in DESTINATION under the URL's file name.

    Args:
        url: Address of the file to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            an error page is never saved as if it were the certificate.
        requests.RequestException: on connection problems or timeouts.
        OSError: if the destination file cannot be written.
    """
    filename = os.path.basename(urlparse(url).path)
    response = requests.get(url, allow_redirects=True, timeout=timeout)
    response.raise_for_status()
    # The context manager closes the file even if the write fails.
    with open(os.path.join(DESTINATION, filename), 'wb') as pdf_file:
        pdf_file.write(response.content)
# Script entry: start a headless Firefox, log in, collect every course id and
# download the certificate of each course that has one.
opts = webdriver.FirefoxOptions()
opts.binary_location = r"C:/Program Files/Mozilla Firefox/firefox.exe"
opts.add_argument("--headless")
driver = webdriver.Firefox(options=opts)
try:
    do_login(driver)
    ids = get_courses_id(driver)
    for course_id in ids:
        url = get_certificate_url(driver, course_id)
        if url is None:
            continue
        try:
            download(url)
        except requests.RequestException as error:
            # One failed download should not abort the remaining ones.
            print("Could not download {}: {}".format(url, error))
finally:
    # quit() ends the whole WebDriver session (close() only closes the
    # current window), and the finally clause runs it even after an error so
    # no headless browser is left behind.
    driver.quit()