diff --git a/main.go b/main.go index 29f6603..84c423f 100644 --- a/main.go +++ b/main.go @@ -16,7 +16,7 @@ import ( func main() { // Load environment variables - godotenv.Load() + godotenv.Load() // TODO: I Don't think this does anything // Setup flags @@ -33,6 +33,7 @@ func main() { term := flag.String("term", "", "Alongside -coursebook, specifies the term to scrape, i.e. 23S") startPrefix := flag.String("startprefix", "", "Alongside -coursebook, specifies the course prefix to start scraping from, i.e. cp_span") resume := flag.Bool("resume", false, "Alongside -coursebook, signifies that scraping should begin at the last complete prefix and should not re-scrape existing data") + retry := flag.Int("retry", 0, "Alongside -coursebook, specifies how many times to retry before quitting") // Flag for profile scraping scrapeProfiles := flag.Bool("profiles", false, "Alongside -scrape, signifies that professor profiles should be scraped.") @@ -87,8 +88,8 @@ func main() { } defer logFile.Close() - // Set logging output destination to a SplitWriter that writes to both the log file and stdout - log.SetOutput(utils.NewSplitWriter(logFile, os.Stdout)) + // Set logging output destination to a SplitWriter that writes to both the log file and stderr + log.SetOutput(utils.NewSplitWriter(logFile, os.Stdout)) // TODO: Switch to stderr // Do verbose logging if verbose flag specified if *verbose { log.SetFlags(log.Ltime | log.Lmicroseconds | log.Lshortfile | utils.Lverbose) @@ -104,10 +105,7 @@ func main() { case *scrapeProfiles: scrapers.ScrapeProfiles(*outDir) case *scrapeCoursebook: - if *term == "" { - log.Panic("No term specified for coursebook scraping! 
Use -term to specify.") - } - scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir, *resume) + scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir, *resume, *retry) case *scrapeDiscounts: scrapers.ScrapeDiscounts(*outDir) case *cometCalendar: diff --git a/scrapers/coursebook.go b/scrapers/coursebook.go index 539b6a8..6264a2c 100644 --- a/scrapers/coursebook.go +++ b/scrapers/coursebook.go @@ -33,23 +33,80 @@ const ( reqThrottle = 400 * time.Millisecond prefixThrottle = 5 * time.Second httpTimeout = 10 * time.Second + networkWait = 30 * time.Second // TODO: UNUSED ) -// ScrapeCoursebook scrapes utd coursebook for the provided term (semester) -func ScrapeCoursebook(term string, startPrefix string, outDir string, resume bool) { +// ScrapeCoursebook Scrapes utd coursebook for provided term with specified options +func ScrapeCoursebook(term string, startPrefix string, outDir string, resume bool, retry int) { + var lastErr error = nil + repeatErrCount := 0 + for repeatErrCount <= retry { // While instead? 
+ + err := scrapeCoursebookInternal(term, startPrefix, outDir, resume) + + // No error + if err == nil { + return + } + + /* Non-retry Errors */ + + // Setup Error (such as invalid args) + var setupErr *utils.SetupError + if errors.As(err, &setupErr) { + log.Fatalf("Coursebook Scraping Setup Failed: %v", err) + } + + // Context canceled Error (such as when closing chromedp window) + if err.Error() == "context canceled" { + log.Fatalf("Coursebook Scraping Canceled, Exiting") + } + + /* Retry Coursebook Scraping */ + log.Printf("Coursebook Scraping Failed: %v", err) + + if fmt.Sprintf("%v", lastErr) == fmt.Sprintf("%v", err) { + repeatErrCount++ + } else { + repeatErrCount = 1 + } + + lastErr = err + + // TODO: handle netid (using setup error) + // TODO: handle network issues -> wait longer before restarting + } + + if retry != 0 { + log.Fatalf("Coursebook Scraping Failed %d times in a row with the same error, Exiting", retry+1) + } +} + +// scrapeCoursebookInternal scrapes utd coursebook for the provided term (semester) +func scrapeCoursebookInternal(term string, startPrefix string, outDir string, resume bool) error { + if term == "" { + return &utils.SetupError{Message: "No term specified for coursebook scraping! 
Use -term to specify."} + } if startPrefix != "" && !prefixRegex.MatchString(startPrefix) { - log.Fatalf("Invalid starting prefix %s, must match format cp_{abcde}", startPrefix) + return &utils.SetupError{Message: fmt.Sprintf("invalid starting prefix %s, must match format cp_{abcde}", startPrefix)} } if !termRegex.MatchString(term) { - log.Fatalf("Invalid term %s, must match format {00-99}{s/f/u}", term) + return &utils.SetupError{Message: fmt.Sprintf("invalid term %s, must match format {00-99}{s/f/u}", term)} } - scraper := newCoursebookScraper(term, outDir) + scraper, err := newCoursebookScraper(term, outDir) + if err != nil { + return err + } defer scraper.chromedpCancel() if resume && startPrefix == "" { // providing a starting prefix overrides the resume flag - startPrefix = scraper.lastCompletePrefix() + var err error + startPrefix, err = scraper.lastCompletePrefix() + if err != nil { + return &utils.SetupError{Message: fmt.Sprintf("failed to get last complete prefix while resuming: %v", err)} + } } log.Printf("[Begin Scrape] Starting scrape for term %s with %d prefixes", term, len(scraper.prefixes)) @@ -62,7 +119,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo start := time.Now() if err := scraper.ensurePrefixFolder(prefix); err != nil { - log.Fatal(err) + log.Panic(err) } var sectionIds []string @@ -76,7 +133,7 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo } if err != nil { - log.Fatalf("Error getting section ids for %s ", prefix) + log.Panicf("Error getting section ids for %s ", prefix) } if len(sectionIds) == 0 { @@ -89,10 +146,10 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo for _, sectionId := range sectionIds { content, err := scraper.getSectionContent(sectionId) if err != nil { - log.Fatalf("Error getting section content for section %s: %v", sectionId, err) + return fmt.Errorf("error getting section content for section %s: %v", sectionId, 
err) } if err := scraper.writeSection(prefix, sectionId, content); err != nil { - log.Fatalf("Error writing section %s: %v", sectionId, err) + log.Panicf("Error writing section %s: %v", sectionId, err) } time.Sleep(reqThrottle) } @@ -104,8 +161,10 @@ func ScrapeCoursebook(term string, startPrefix string, outDir string, resume boo log.Printf("[Scrape Complete] Finished scraping term %s in %v. Total sections %d: Total retries %d", term, time.Since(totalTime), scraper.totalScrapedSections, scraper.reqRetries) if err := scraper.validate(); err != nil { - log.Fatal("Validating failed: ", err) + log.Panicf("Validating failed: %v", err) } + + return nil } type coursebookScraper struct { @@ -124,38 +183,45 @@ type coursebookScraper struct { totalScrapedSections int } -func newCoursebookScraper(term string, outDir string) *coursebookScraper { +func newCoursebookScraper(term string, outDir string) (*coursebookScraper, error) { ctx, cancel := utils.InitChromeDp() httpClient := &http.Client{ Timeout: httpTimeout, } //prefixes in alphabetical order for skip prefix flag - prefixes := utils.GetCoursePrefixes(ctx) + prefixes, err := utils.GetCoursePrefixes(ctx) + if err != nil { + return nil, err + } sort.Strings(prefixes) + coursebookHeaders, err := utils.RefreshToken(ctx) + if err != nil { + return nil, err + } return &coursebookScraper{ chromedpCtx: ctx, chromedpCancel: cancel, httpClient: httpClient, prefixes: prefixes, - coursebookHeaders: utils.RefreshToken(ctx), + coursebookHeaders: coursebookHeaders, term: term, outDir: outDir, prefixIdsCache: make(map[string][]string), - } + }, nil } // lastCompletePrefix returns the last prefix (alphabetical order) that contains // html files for all of its section ids. 
returns an empty string if there are no // complete prefixes -func (s *coursebookScraper) lastCompletePrefix() string { +func (s *coursebookScraper) lastCompletePrefix() (string, error) { if err := s.ensureOutputFolder(); err != nil { - log.Fatal(err) + return "", err } dir, err := os.ReadDir(filepath.Join(s.outDir, s.term)) if err != nil { - log.Fatalf("failed to read output directory: %v", err) + return "", fmt.Errorf("failed to read output directory: %w", err) } foundPrefixes := make([]string, 0, len(s.prefixes)) @@ -169,14 +235,14 @@ func (s *coursebookScraper) lastCompletePrefix() string { for _, prefix := range foundPrefixes { missing, err := s.getMissingIdsForPrefix(prefix) if err != nil { - log.Fatalf("Failed to get ids: %v", err) + return "", fmt.Errorf("failed to get ids: %w", err) } if len(missing) == 0 { - return prefix + return prefix, nil } time.Sleep(reqThrottle) } - return "" + return "", nil } // ensurePrefixFolder creates {outDir}/term if it does not exist @@ -235,7 +301,7 @@ func (s *coursebookScraper) getMissingIdsForPrefix(prefix string) ([]string, err dir, err := os.ReadDir(path) if err != nil { - log.Panicf("Failed to access folder %s: %v", path, err) + return sectionIds, fmt.Errorf("failed to access folder %s: %w", path, err) } foundIds := make(map[string]bool) @@ -285,7 +351,7 @@ func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (s err := utils.Retry(func() error { req, err := http.NewRequest("POST", "https://coursebook.utdallas.edu/clips/clip-cb11-hat.zog", strings.NewReader(queryStr)) if err != nil { - log.Fatalf("Http request failed: %v", err) + return fmt.Errorf("http request failed: %w", err) } req.Header = s.coursebookHeaders @@ -310,7 +376,13 @@ func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (s return err }, retries, func(numRetries int) { utils.VPrintf("[Request Retry] Attempt %d of %d for request %s", numRetries, retries, reqName) - s.coursebookHeaders = 
utils.RefreshToken(s.chromedpCtx) + coursebookHeaders, err := utils.RefreshToken(s.chromedpCtx) + if err != nil { + // TODO: Since this is in a retry, perhaps we should implement this differently + utils.VPrintf("[Token Refresh Failed] Failed to refresh token during retry for request %s: %v", reqName, err) + } + s.coursebookHeaders = coursebookHeaders + s.reqRetries++ //back off exponentially @@ -345,7 +417,7 @@ func (s *coursebookScraper) validate() error { log.Printf("[Validation] Missing %d sections for %s", len(ids), prefix) if err := s.ensurePrefixFolder(prefix); err != nil { - log.Fatal(err) + log.Panic(err) } for _, id := range ids { diff --git a/utils/methods.go b/utils/methods.go index 70b8bfe..075404c 100644 --- a/utils/methods.go +++ b/utils/methods.go @@ -8,6 +8,7 @@ import ( "fmt" "io/fs" "log" + "math" "os" "path/filepath" "regexp" @@ -54,14 +55,14 @@ func InitChromeDp() (chromedpCtx context.Context, cancelFnc context.CancelFunc) } // RefreshToken logs into CourseBook and returns headers containing a fresh session token. 
-func RefreshToken(chromedpCtx context.Context) map[string][]string { +func RefreshToken(chromedpCtx context.Context) (map[string][]string, error) { netID, err := GetEnv("LOGIN_NETID") if err != nil { - panic(err) + return nil, err } password, err := GetEnv("LOGIN_PASSWORD") if err != nil { - panic(err) + return nil, err } delayedRetryCallback := func(numRetries int) { @@ -81,13 +82,13 @@ func RefreshToken(chromedpCtx context.Context) map[string][]string { chromedp.Click(`button#login-button`), ) if r != nil && r.Status != 200 { - return errors.New("Non-200 response status code") + return fmt.Errorf("non-200 response status code: %d", r.Status) } return err }, 3, delayedRetryCallback) if err != nil { - panic(err) + return nil, err // TODO: we should return different error or error types based on the response code } time.Sleep(250 * time.Millisecond) @@ -124,7 +125,7 @@ func RefreshToken(chromedpCtx context.Context) map[string][]string { }, 3, delayedRetryCallback) if err != nil { - panic(err) + return nil, err } return map[string][]string{ @@ -135,7 +136,7 @@ func RefreshToken(chromedpCtx context.Context) map[string][]string { "Content-Type": {"application/x-www-form-urlencoded"}, "Cookie": cookieStrs, "Connection": {"keep-alive"}, - } + }, nil } // RefreshAstraToken signs into Astra and returns headers containing authentication cookies. @@ -288,30 +289,53 @@ func Retry(action func() error, maxRetries int, retryCallback func(numRetries in } // GetCoursePrefixes retrieves all course prefix values from CourseBook. 
-func GetCoursePrefixes(chromedpCtx context.Context) []string { +func GetCoursePrefixes(chromedpCtx context.Context) ([]string, error) { // Might need to refresh the token every time we get new course prefixes in the future // refreshToken(chromedpCtx) var coursePrefixes []string log.Println("Finding course prefixes...") - // Get option elements for course prefix dropdown - _, err := chromedp.RunResponse(chromedpCtx, - chromedp.Navigate("https://coursebook.utdallas.edu"), - chromedp.QueryAfter("select#combobox_cp option", - func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { - for _, node := range nodes[1:] { - coursePrefixes = append(coursePrefixes, node.AttributeValue("value")) - } - return nil - }, - ), - ) + var err error + maxRetries := 10 + + for attempt := 1; attempt <= maxRetries; attempt++ { + coursePrefixes = nil // Reset course prefixes for each attempt + + // Get option elements for course prefix dropdown + _, err = chromedp.RunResponse(chromedpCtx, + chromedp.Navigate("https://coursebook.utdallas.edu"), + chromedp.QueryAfter("select#combobox_cp option", // TODO: TEST IF THIS DOESNT EXIST MAYBE + func(ctx context.Context, _ runtime.ExecutionContextID, nodes ...*cdp.Node) error { + for _, node := range nodes[1:] { + coursePrefixes = append(coursePrefixes, node.AttributeValue("value")) + } + return nil + }, + ), + ) + + if err == nil { + break // Success, exit retry loop + } + + // Only retry on page load error + if !strings.Contains(err.Error(), "page load error") { + return nil, err // Return early for unrecognized errors + } + log.Printf("%v", err) // TODO: Should be verbose + + // Exponential backoff + wait := time.Duration(math.Pow(2, float64(attempt))) * time.Second // TODO: Update other backoff to print seconds as well + log.Printf("Coursebook load error, waiting %v (attempt %d of %d)", wait, attempt, maxRetries) + time.Sleep(wait) + } + if err != nil { - log.Panic(err) + return nil, fmt.Errorf("failed to fetch 
course prefixes after %d attempts: %w", maxRetries, err) } log.Printf("Found the %d course prefixes!", len(coursePrefixes)) - return coursePrefixes + return coursePrefixes, nil } // ConvertFromInterface attempts to convert a value into the requested type and returns a pointer when successful. diff --git a/utils/types.go b/utils/types.go new file mode 100644 index 0000000..d80c786 --- /dev/null +++ b/utils/types.go @@ -0,0 +1,17 @@ +package utils + +type SetupError struct { + Message string +} + +func (e *SetupError) Error() string { + return e.Message +} + +type NetworkError struct { + Message string +} + +func (e *NetworkError) Error() string { + return e.Message +} diff --git a/utils/utils_test.go b/utils/utils_test.go index aeb5ba2..a93c48b 100644 --- a/utils/utils_test.go +++ b/utils/utils_test.go @@ -40,7 +40,10 @@ func TestRefreshToken(t *testing.T) { ctx, cancel := InitChromeDp() defer cancel() // Try refreshing token - headers := RefreshToken(ctx) + headers, err := RefreshToken(ctx) + if err != nil { + t.Errorf("Failed to refresh token: %v", err) + } // Make sure we successfully got a PTGSESSID cookie for _, cookie := range headers["Cookie"] { if strings.HasPrefix(cookie, "PTGSESSID") { @@ -48,5 +51,5 @@ func TestRefreshToken(t *testing.T) { } } // Fail if no PTGSESSID cookie found - t.Fatalf("Failed to get PTGSESSID cookie from RefreshToken!") + t.Errorf("Failed to get PTGSESSID cookie from RefreshToken!") }