main.go (12 changes: 5 additions & 7 deletions)
@@ -16,7 +16,7 @@ import (

func main() {
// Load environment variables
godotenv.Load()
godotenv.Load() // TODO: I don't think this does anything

// Setup flags

@@ -33,6 +33,7 @@ func main() {
term := flag.String("term", "", "Alongside -coursebook, specifies the term to scrape, i.e. 23S")
startPrefix := flag.String("startprefix", "", "Alongside -coursebook, specifies the course prefix to start scraping from, i.e. cp_span")
resume := flag.Bool("resume", false, "Alongside -coursebook, signifies that scraping should begin at the last complete prefix and should not re-scrape existing data")
retry := flag.Int("retry", 0, "Alongside -coursebook, specifies how many times to retry before quitting")

// Flag for profile scraping
scrapeProfiles := flag.Bool("profiles", false, "Alongside -scrape, signifies that professor profiles should be scraped.")
@@ -87,8 +88,8 @@ func main() {
}

defer logFile.Close()
// Set logging output destination to a SplitWriter that writes to both the log file and stdout
log.SetOutput(utils.NewSplitWriter(logFile, os.Stdout))
// Set logging output destination to a SplitWriter that writes to both the log file and stderr
log.SetOutput(utils.NewSplitWriter(logFile, os.Stdout)) // TODO: Switch to stderr
// Do verbose logging if verbose flag specified
if *verbose {
log.SetFlags(log.Ltime | log.Lmicroseconds | log.Lshortfile | utils.Lverbose)
@@ -104,10 +105,7 @@ func main() {
case *scrapeProfiles:
scrapers.ScrapeProfiles(*outDir)
case *scrapeCoursebook:
if *term == "" {
log.Panic("No term specified for coursebook scraping! Use -term to specify.")
}
scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir, *resume)
scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir, *resume, *retry)
case *scrapeDiscounts:
scrapers.ScrapeDiscounts(*outDir)
case *cometCalendar:
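
Note on the TODO above: godotenv.Load returns an error (its signature is Load(filenames ...string) error), so a quick way to confirm whether the call is doing anything is to log that error. A minimal sketch, not part of this diff:

	// Sketch only: report whether a .env file was actually found and loaded.
	if err := godotenv.Load(); err != nil {
		log.Printf("godotenv: no .env file loaded: %v", err)
	}
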
scrapers/coursebook.go (122 changes: 97 additions & 25 deletions)
@@ -33,23 +33,80 @@ const (
reqThrottle = 400 * time.Millisecond
prefixThrottle = 5 * time.Second
httpTimeout = 10 * time.Second
networkWait = 30 * time.Second // TODO: UNUSED
)

// ScrapeCoursebook scrapes utd coursebook for the provided term (semester)
func ScrapeCoursebook(term string, startPrefix string, outDir string, resume bool) {
// ScrapeCoursebook scrapes utd coursebook for the provided term with the specified options
func ScrapeCoursebook(term string, startPrefix string, outDir string, resume bool, retry int) {
var lastErr error = nil
repeatErrCount := 0
for repeatErrCount <= retry { // While instead?

err := scrapeCoursebookInternal(term, startPrefix, outDir, resume)

// No error
if err == nil {
return
}

/* Non-retry Errors */

// Setup Error (such as invalid args)
var setupErr *utils.SetupError
if errors.As(err, &setupErr) {
log.Fatalf("Coursebook Scraping Setup Failed: %v", err)
}

// Context canceled Error (such as when closing chromedp window)
if err.Error() == "context canceled" {
log.Fatalf("Coursebook Scraping Canceled, Exiting")
}

/* Retry Coursebook Scraping */
log.Printf("Coursebook Scraping Failed: %v", err)

if fmt.Sprintf("%v", lastErr) == fmt.Sprintf("%v", err) {
repeatErrCount++
} else {
repeatErrCount = 1
}

lastErr = err

// TODO: handle netid (using setup error)
// TODO: handle network issues -> wait longer before restarting
}

if retry != 0 {
log.Fatalf("Coursebook Scraping Failed %d times in a row with the same error, Exiting", retry+1)
}
}

// scrapeCoursebookInternal scrapes utd coursebook for the provided term (semester)
func scrapeCoursebookInternal(term string, startPrefix string, outDir string, resume bool) error {
if term == "" {
return &utils.SetupError{Message: "No term specified for coursebook scraping! Use -term to specify."}
}
if startPrefix != "" && !prefixRegex.MatchString(startPrefix) {
log.Fatalf("Invalid starting prefix %s, must match format cp_{abcde}", startPrefix)
return &utils.SetupError{Message: fmt.Sprintf("invalid starting prefix %s, must match format cp_{abcde}", startPrefix)}
}
if !termRegex.MatchString(term) {
log.Fatalf("Invalid term %s, must match format {00-99}{s/f/u}", term)
return &utils.SetupError{Message: fmt.Sprintf("invalid term %s, must match format {00-99}{s/f/u}", term)}
}

scraper := newCoursebookScraper(term, outDir)
scraper, err := newCoursebookScraper(term, outDir)
if err != nil {
return err
}
defer scraper.chromedpCancel()

if resume && startPrefix == "" {
// providing a starting prefix overrides the resume flag
startPrefix = scraper.lastCompletePrefix()
var err error
startPrefix, err = scraper.lastCompletePrefix()
if err != nil {
return &utils.SetupError{Message: fmt.Sprintf("failed to get last complete prefix while resuming: %v", err)}
}
}

log.Printf("[Begin Scrape] Starting scrape for term %s with %d prefixes", term, len(scraper.prefixes))
@@ -62,7 +119,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo

start := time.Now()
if err := scraper.ensurePrefixFolder(prefix); err != nil {
log.Fatal(err)
log.Panic(err)
}

var sectionIds []string
@@ -76,7 +133,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo
}

if err != nil {
log.Fatalf("Error getting section ids for %s ", prefix)
log.Panicf("Error getting section ids for %s ", prefix)
}

if len(sectionIds) == 0 {
@@ -89,10 +146,10 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo
for _, sectionId := range sectionIds {
content, err := scraper.getSectionContent(sectionId)
if err != nil {
log.Fatalf("Error getting section content for section %s: %v", sectionId, err)
return fmt.Errorf("error getting section content for section %s: %v", sectionId, err)
}
if err := scraper.writeSection(prefix, sectionId, content); err != nil {
log.Fatalf("Error writing section %s: %v", sectionId, err)
log.Panicf("Error writing section %s: %v", sectionId, err)
}
time.Sleep(reqThrottle)
}
@@ -104,8 +161,10 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo
log.Printf("[Scrape Complete] Finished scraping term %s in %v. Total sections %d: Total retries %d", term, time.Since(totalTime), scraper.totalScrapedSections, scraper.reqRetries)

if err := scraper.validate(); err != nil {
log.Fatal("Validating failed: ", err)
log.Panicf("Validating failed: %v", err)
}

return nil
}

type coursebookScraper struct {
@@ -124,38 +183,45 @@ type coursebookScraper struct {
totalScrapedSections int
}

func newCoursebookScraper(term string, outDir string) *coursebookScraper {
func newCoursebookScraper(term string, outDir string) (*coursebookScraper, error) {
ctx, cancel := utils.InitChromeDp()
httpClient := &http.Client{
Timeout: httpTimeout,
}

//prefixes in alphabetical order for skip prefix flag
prefixes := utils.GetCoursePrefixes(ctx)
prefixes, err := utils.GetCoursePrefixes(ctx)
if err != nil {
return nil, err
}
sort.Strings(prefixes)
coursebookHeaders, err := utils.RefreshToken(ctx)
if err != nil {
return nil, err
}
return &coursebookScraper{
chromedpCtx: ctx,
chromedpCancel: cancel,
httpClient: httpClient,
prefixes: prefixes,
coursebookHeaders: utils.RefreshToken(ctx),
coursebookHeaders: coursebookHeaders,
term: term,
outDir: outDir,
prefixIdsCache: make(map[string][]string),
}
}, nil
}

// lastCompletePrefix returns the last prefix (alphabetical order) that contains
// html files for all of its section ids. returns an empty string if there are no
// complete prefixes
func (s *coursebookScraper) lastCompletePrefix() string {
func (s *coursebookScraper) lastCompletePrefix() (string, error) {
if err := s.ensureOutputFolder(); err != nil {
log.Fatal(err)
return "", err
}

dir, err := os.ReadDir(filepath.Join(s.outDir, s.term))
if err != nil {
log.Fatalf("failed to read output directory: %v", err)
return "", fmt.Errorf("failed to read output directory: %w", err)
}

foundPrefixes := make([]string, 0, len(s.prefixes))
@@ -169,14 +235,14 @@ func (s *coursebookScraper) lastCompletePrefix() string {
for _, prefix := range foundPrefixes {
missing, err := s.getMissingIdsForPrefix(prefix)
if err != nil {
log.Fatalf("Failed to get ids: %v", err)
return "", fmt.Errorf("failed to get ids: %w", err)
}
if len(missing) == 0 {
return prefix
return prefix, nil
}
time.Sleep(reqThrottle)
}
return ""
return "", nil
}

// ensurePrefixFolder creates {outDir}/term if it does not exist
@@ -235,7 +301,7 @@ func (s *coursebookScraper) getMissingIdsForPrefix(prefix string) ([]string, err

dir, err := os.ReadDir(path)
if err != nil {
log.Panicf("Failed to access folder %s: %v", path, err)
return sectionIds, fmt.Errorf("failed to access folder %s: %w", path, err)
}

foundIds := make(map[string]bool)
@@ -285,7 +351,7 @@ func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (s
err := utils.Retry(func() error {
req, err := http.NewRequest("POST", "https://coursebook.utdallas.edu/clips/clip-cb11-hat.zog", strings.NewReader(queryStr))
if err != nil {
log.Fatalf("Http request failed: %v", err)
return fmt.Errorf("http request failed: %w", err)
}
req.Header = s.coursebookHeaders

@@ -310,7 +376,13 @@ func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (s
return err
}, retries, func(numRetries int) {
utils.VPrintf("[Request Retry] Attempt %d of %d for request %s", numRetries, retries, reqName)
s.coursebookHeaders = utils.RefreshToken(s.chromedpCtx)
coursebookHeaders, err := utils.RefreshToken(s.chromedpCtx)
if err != nil {
// TODO: Since this is in a retry, perhaps we should implement this differently
utils.VPrintf("[Token Refresh Failed] Failed to refresh token during retry for request %s: %v", reqName, err)
}
s.coursebookHeaders = coursebookHeaders

s.reqRetries++

//back off exponentially
@@ -345,7 +417,7 @@ func (s *coursebookScraper) validate() error {
log.Printf("[Validation] Missing %d sections for %s", len(ids), prefix)

if err := s.ensurePrefixFolder(prefix); err != nil {
log.Fatal(err)
log.Panic(err)
}

for _, id := range ids {
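
The retry logic above relies on utils.SetupError, which is not included in this diff. For errors.As to match it, the type has to implement the error interface; a minimal sketch of what it presumably looks like, with the name and shape assumed from its usage here:

	// SetupError marks failures caused by bad arguments or other setup problems,
	// so ScrapeCoursebook can fail fast instead of retrying.
	type SetupError struct {
		Message string
	}

	func (e *SetupError) Error() string {
		return e.Message
	}
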
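utils.Retry is also outside this diff; judging from the call in req (a function to attempt, a retry count, and a per-retry callback that refreshes the token and backs off), it presumably behaves roughly like this sketch. The signature and semantics are assumed, not the actual implementation:

	// Retry runs fn, and while it keeps failing, calls onRetry(attempt) and tries
	// again, up to `retries` additional attempts. The final error is returned.
	func Retry(fn func() error, retries int, onRetry func(attempt int)) error {
		err := fn()
		for attempt := 1; err != nil && attempt <= retries; attempt++ {
			onRetry(attempt)
			err = fn()
		}
		return err
	}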