How Is Golang Used for Web Scraping with Concurrency?

Data Flow Designs

Use Cases

Code Highlights

package utils

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// GetPage performs an HTTP request against siteURL and parses the response
// body into an HTML document.
//
// Parameters:
//   - ctx: request context used for cancellation and deadlines.
//   - method: HTTP method (GET, POST, ...); any method is supported.
//   - siteURL: fully-qualified URL to request.
//   - cookies: optional cookies attached to the request.
//   - headers: optional header key/value pairs added to the request.
//   - formDatas: optional form fields; when non-empty they are URL-encoded
//     and sent as the request body.
//   - timeout: request timeout in seconds; 0 selects the 10-second default.
//
// Returns the parsed goquery document, the cookies set by the response, and
// an error if building/executing the request or parsing the body fails.
func GetPage(ctx context.Context, method, siteURL string, cookies []*http.Cookie, headers, formDatas map[string]string, timeout int) (*goquery.Document, []*http.Cookie, error) {
	// body stays nil for methods that carry no request body.
	body := io.Reader(nil)

	// If form data was supplied, URL-encode it into the request body.
	if len(formDatas) > 0 {
		form := url.Values{}
		for k, v := range formDatas {
			form.Add(k, v)
		}
		body = strings.NewReader(form.Encode())
	}

	// Create a new HTTP request bound to the caller's context.
	req, err := http.NewRequestWithContext(ctx, method, siteURL, body)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create http request context: %w", err)
	}

	// Ranging over a nil map/slice is a no-op, so no emptiness guards needed.
	for k, v := range headers {
		req.Header.Add(k, v)
	}
	for _, c := range cookies {
		req.AddCookie(c)
	}

	// BUG FIX: a form-encoded body must advertise its media type or servers
	// cannot parse the posted fields. Set it only when the caller has not
	// already supplied an explicit Content-Type header.
	if len(formDatas) > 0 && req.Header.Get("Content-Type") == "" {
		req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	}

	// Fall back to a 10-second timeout when none is configured.
	reqTimeout := 10 * time.Second
	if timeout != 0 {
		reqTimeout = time.Duration(timeout) * time.Second
	}

	// Plain client on the default transport; only the timeout is customized.
	httpClient := &http.Client{
		Transport:     http.DefaultTransport,
		CheckRedirect: nil,
		Jar:           nil,
		Timeout:       reqTimeout,
	}

	// Execute the request.
	resp, err := httpClient.Do(req)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to execute http request: %w", err)
	}
	// Always release the response body.
	defer func() { _ = resp.Body.Close() }()

	// Parse the response body into an HTML document.
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to parse html: %w", err)
	}

	return doc, resp.Cookies(), nil
}
// getCurrencyHistory gets the currency value on a specific date. func getCurrencyHistory(ctx context.Context, from, to, date string) (*entities.CurrencyHistory, error) { urlValues := url.Values{ "from": {to}, // Reverse `from` and `to` due to easily parse the currency value. "amount": {"1"}, "date": {date}, } siteURL := fmt.Sprintf("https://www.x-rates.com/historical/?%s", urlValues.Encode()) // Scrape the page. doc, _, err := utils.GetPage(ctx, http.MethodGet, siteURL, nil, nil, nil, 0) if err != nil { return nil, err } var currencyHistory *entities.CurrencyHistory // Scrape the currency value. doc.Find(".ratesTable tbody tr td").EachWithBreak(func(i int, s *goquery.Selection) bool { // Scrap the attribute href value from `a` tag HTML. // https://www.x-rates.com/graph/?from=JPY&to=IDR // Ignore exists value due to also will check in next line. href, _ := s.Find("a").Attr("href") // Reverse `from` and `to` due to easily parse the currency value. if !strings.Contains(href, "to="+from) { return true } // If the target currency match, scrape the text value. valueString := s.Find("a").Text() value, err := strconv.ParseFloat(valueString, 64) if err != nil { return true } currencyHistory = &entities.CurrencyHistory{ Date: date, Value: value, } return false }) return currencyHistory, nil }
/ getCurrencyHistories gets the currencies value on a range date. func getCurrencyHistories(ctx context.Context, start, end time.Time, from, to string) ([]*entities.CurrencyHistory, error) { // Get the number of days between start and end date. days := int(end.Sub(start).Hours()/24) + 1 currencyHistories := make([]*entities.CurrencyHistory, days) eg, ctx := errgroup.WithContext(ctx) idx := 0 for d := start; d.After(end) == false; d = d.AddDate(0, 0, 1) { // Defined new variable to avoid mismatch value when using goroutine. d := d i := idx // Concurrently gets the value on specific date. eg.Go(func() (err error) { currencyHistory, err := getCurrencyHistory(ctx, from, to, d.Format("2006-01-02")) currencyHistories[i] = currencyHistory return err }) idx++ } // Wait all request finished and check the error. if err := eg.Wait(); err != nil { return nil, err } return currencyHistories, nil }
// getCurrencyHistories gets the currencies value on a range date. func getCurrencyHistories(ctx context.Context, start, end time.Time, from, to string) ([]*entities.CurrencyHistory, error) { // Get the number of days between start and end date. days := int(end.Sub(start).Hours()/24) + 1 currencyHistories := make([]*entities.CurrencyHistory, days) idx := 0 for d := start; d.After(end) == false; d = d.AddDate(0, 0, 1) { currencyHistory, err := getCurrencyHistory(ctx, from, to, d.Format("2006-01-02")) if err != nil { return nil, err } currencyHistories[idx] = currencyHistory idx++ } return currencyHistories, nil }
v1/currency/history?from=IDR&to=JPY&start_date=2022-03-01&end_date=2022-03-10
Benchmark environment: Mac mini (M1, 2020), Apple M1 chip, 16 GB memory, macOS Monterey version 12.3. Internet speed: 42.53 Mbps download, 15.34 Mbps upload, 29 ms ping; internet region: Indonesia.

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
iWeb Scraping Services

iWeb Scraping Services

Web Scraping services with iWeb Scraping Company, a leading data scraping services provider in the USA, India, Australia, UAE, UK, and more countries, at affordable prices.