scrapify

package module
v0.6.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 2, 2024 License: MIT Imports: 8 Imported by: 0

README

Scrapify🚀: A Go Library for Web Scraping

This library provides tools for building web scrapers in Go. It allows you to create custom HTTP requests with advanced features for bypassing basic anti-scraping measures.

Features

  • Customizable Headers: Set various headers like User-Agent and sec-ch-ua to mimic a real browser.
  • TLS Configuration: Customize the Transport Layer Security (TLS) configuration for secure connections.
  • Browser Emulation: Specify different browsers (Chrome, Firefox, Edge) to influence the cipher suites offered.
  • Default Headers: Includes a set of common headers to improve compatibility.

Installation

  go get -u github.com/iarsham/scrapify@latest

Usage

package main

import (
	"fmt"
	"github.com/iarsham/scrapify"
	"net/http"
)

func main() {
	client := &http.Client{
		Transport: scrapify.NewTransport(scrapify.Chrome),
	}
	req, err := http.NewRequest(http.MethodGet, "https://cloudflare.com", nil)
	if err != nil {
		panic(err)
	}
	scrapify.SetHeaders(req, nil)
	resp, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println(resp.StatusCode)
}
package main

import (
	"fmt"
	"github.com/iarsham/scrapify"
)

func main() {
	c := colly.NewCollector()
	c.WithTransport(scrapify.NewTransport(scrapify.Chrome))

	c.OnRequest(func(r *colly.Request) {
		scrapify.SetCollyHeaders(r, nil)
	})

	c.OnHTML("body", func(e *colly.HTMLElement) {
		fmt.Println(e.Text)
	})

	c.OnResponse(func(r *colly.Response) {
		fmt.Println(r.StatusCode)
	})

	if err := c.Visit("https://chatgpt.com"); err != nil {
		panic(err)
	}
}

Contributing

We welcome contributions to this library. Please feel free to submit pull requests with improvements and bug fixes.

License

This library is licensed under the MIT License. See the LICENSE file for details.

Documentation

Index

Constants

This section is empty.

Variables

View Source
var MapBrowserToCipher = map[Browser][]uint16{
	Chrome: {
		0xbaba,
		tls.TLS_AES_128_GCM_SHA256,
		tls.TLS_AES_256_GCM_SHA384,
		tls.TLS_CHACHA20_POLY1305_SHA256,
		tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
		tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
		tls.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
		tls.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
		tls.TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256,
		tls.TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256,
		tls.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,
		tls.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,
		tls.TLS_RSA_WITH_AES_128_GCM_SHA256,
		tls.TLS_RSA_WITH_AES_256_GCM_SHA384,
		tls.TLS_RSA_WITH_AES_128_CBC_SHA,
		tls.TLS_RSA_WITH_AES_256_CBC_SHA,
	},
	Firefox: {
		tls.TLS_AES_128_GCM_SHA256,
		tls.TLS_CHACHA20_POLY1305_SHA256,
		tls.TLS_AES_256_GCM_SHA384,
		tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
		tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
		tls.TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256,
		tls.TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256,
		tls.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
		tls.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
		tls.TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA,
		tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA,
		tls.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,
		tls.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,
		tls.TLS_RSA_WITH_AES_128_GCM_SHA256,
		tls.TLS_RSA_WITH_AES_256_GCM_SHA384,
		tls.TLS_RSA_WITH_AES_128_CBC_SHA,
		tls.TLS_RSA_WITH_AES_256_CBC_SHA,
	},
	Edge: {
		0x1a1a,
		tls.TLS_AES_128_GCM_SHA256,
		tls.TLS_AES_256_GCM_SHA384,
		tls.TLS_CHACHA20_POLY1305_SHA256,
		tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
		tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
		tls.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
		tls.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
		tls.TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256,
		tls.TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256,
		tls.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA,
		tls.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA,
		tls.TLS_RSA_WITH_AES_128_GCM_SHA256,
		tls.TLS_RSA_WITH_AES_256_GCM_SHA384,
		tls.TLS_RSA_WITH_AES_128_CBC_SHA,
		tls.TLS_RSA_WITH_AES_256_CBC_SHA,
	},
}

Functions

func NewTransport

func NewTransport(browser Browser) http.RoundTripper

func ReadAll added in v0.6.0

func ReadAll(resp *http.Response) ([]byte, error)

func SetCollyHeaders added in v0.4.0

func SetCollyHeaders(req *colly.Request, headers M)

func SetHeaders added in v0.2.0

func SetHeaders(req *http.Request, headers M)

Types

type Browser

type Browser int
const (
	Chrome Browser = iota
	Firefox
	Edge
)

type M added in v0.2.0

type M map[string]string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL