diff options
| author | Wayne-Cole <77279425+Wacky404@users.noreply.github.com> | 2025-05-18 16:14:04 -0500 |
|---|---|---|
| committer | Wayne-Cole <77279425+Wacky404@users.noreply.github.com> | 2025-05-18 16:14:04 -0500 |
| commit | 291b2680ce39f1bf8224520742c19935d85cdd54 (patch) | |
| tree | a9761ae9b9c71805808f5d9a8a45ab89caf184b4 | |
| parent | f9bdc212b29ef30b9dcdf1e78bab4f19f7e204a7 (diff) | |
| download | lurchers-291b2680ce39f1bf8224520742c19935d85cdd54.tar.xz lurchers-291b2680ce39f1bf8224520742c19935d85cdd54.zip | |
update: configured db connection and functions + minor changes to files
| -rw-r--r-- | .gitignore | 7 | ||||
| -rw-r--r-- | cmd/lurchers/main.go | 39 | ||||
| -rw-r--r-- | data/configs/book.go | 78 | ||||
| -rw-r--r-- | data/configs/indeed.go | 20 | ||||
| -rw-r--r-- | data/jobs.go | 32 | ||||
| -rw-r--r-- | db/connect.go | 61 | ||||
| -rw-r--r-- | db/sql.go | 45 | ||||
| -rw-r--r-- | evade/agent.go | 4 | ||||
| -rw-r--r-- | evade/proxy.go | 4 | ||||
| -rw-r--r-- | go.mod | 10 | ||||
| -rw-r--r-- | go.sum | 10 | ||||
| -rw-r--r-- | handler/err.go | 9 | ||||
| -rw-r--r-- | handler/img.go | 10 | ||||
| -rw-r--r-- | util/env.go | 12 | ||||
| -rw-r--r-- | util/logger.go | 99 |
15 files changed, 420 insertions, 20 deletions
@@ -12,6 +12,7 @@ *.test # Output of the go coverage tool, specifically when used with LiteIDE +**/bin/ *.out # Dependency directories (remove the comment below to include it) @@ -23,3 +24,9 @@ go.work.sum # env file .env + +# macos things +.DS_Store + +#logs +logs/ diff --git a/cmd/lurchers/main.go b/cmd/lurchers/main.go index f6175b0..2a127f8 100644 --- a/cmd/lurchers/main.go +++ b/cmd/lurchers/main.go @@ -1,22 +1,37 @@ package main import ( - "github.com/Wacky404/lurchers/data" + "context" + "log" + "log/slog" + "time" + + "github.com/Wacky404/lurchers/util" "github.com/gocolly/colly" + "github.com/joho/godotenv" ) func main() { - bookShelve := []data.Book{} - c := colly.NewCollector() - c.OnHTML("div[class]", func(e *colly.HTMLElement) { - className := e.Attr("class") - if className == "product" { - b := new(data.Book) - } - }) + logFile, err := util.SetupLogger(util.WithLogName("logs/lurchers.log")) + if err != nil { + log.Fatal("error setting up logger", err) + } + defer logFile.Close() + + // this time out value will change + ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*500) + defer cancel() + + err = godotenv.Load() + if err != nil { + slog.Error("error loading .env file", slog.Any("error", err)) + } - c.OnHTML("a[href]", func(e *colly.HTMLElement) { - link := e.Attr("href") - c.Visit(e.Request.AbsoluteURL(link)) + // before making a request print "Visiting..." + c.OnRequest(func(r *colly.Request) { + slog.Info("Visiting", slog.String("Request URL", r.URL.String())) }) + // start scraping on website(s) + c.Visit("https://store.crunchyroll.com/collections/manga-books/?srule=Most-Popular") + c.Wait() } diff --git a/data/configs/book.go b/data/configs/book.go index 27459e6..7f70f9c 100644 --- a/data/configs/book.go +++ b/data/configs/book.go @@ -1 +1,79 @@ +/* +* TODO: Pagination works for Crunchyroll store for manga-books but +* Price selector is not working. Need to fix. Then persisting storage + */ package configs + +import ( + "context" + "log" + "log/slog" + "time" + + "github.com/Wacky404/lurchers/evade" + "github.com/Wacky404/lurchers/util" + "github.com/gocolly/colly" + "github.com/joho/godotenv" +) + +func main() { + logFile, err := util.SetupLogger(util.WithLogName("logs/lurchers.log")) + if err != nil { + log.Fatal("error setting up logger", err) + } + defer logFile.Close() + + // this time out value will change + ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*500) + defer cancel() + + err = godotenv.Load() + if err != nil { + slog.Error("error loading .env file", slog.Any("error", err)) + } + + // bookShelve := []data.Book{} + c := colly.NewCollector() + evade.NewUserAgent(ctx, c) + proxies := []string{util.GetVar("TOR", "socks5://127.0.0.1:9050")} + err = evade.RotateProxy(c, &proxies) + if err != nil { + slog.Error("error configuring the RotateProxy", slog.Any("error", err)) + } + + c.OnHTML("div[class]", func(e *colly.HTMLElement) { + className := e.Attr("class") + pricemsrp := e.Attr("content") + if className == "product" { + // Replace multiple newlines with a single space + // cleaned := strings.ReplaceAll(e.DOM.Text(), "\n", " ") + // remove multiple spaces + // cleaned = strings.Join(strings.Fields(cleaned), " ") + title := e.ChildText(".pdp-link") + priceDiscout := e.ChildText(".sales") + slog.Info("Product Found", slog.String("Title", title), slog.String("Price", pricemsrp), slog.String("Sale", priceDiscout)) + } + }) + + c.OnHTML("a.image-tile-container", func(e *colly.HTMLElement) { + srcLink := e.ChildText("src") + slog.Debug(srcLink) + }) + + c.OnHTML("div.pagination-block ul li a", func(e *colly.HTMLElement) { + rightArrow := e.Attr("class") + if rightArrow == "right-arrow" { + link := e.Attr("href") + c.Visit(e.Request.AbsoluteURL(link)) + } + }) + + // before making a request print "Visiting..." + c.OnRequest(func(r *colly.Request) { + slog.Info("Visiting", slog.String("Request URL", r.URL.String())) + }) + + // start scraping on crunchyroll + c.Visit("https://store.crunchyroll.com/collections/manga-books/?srule=Most-Popular") + c.Wait() +} diff --git a/data/configs/indeed.go b/data/configs/indeed.go new file mode 100644 index 0000000..3fad0fd --- /dev/null +++ b/data/configs/indeed.go @@ -0,0 +1,20 @@ +package configs + +import ( + "github.com/Wacky404/lurchers/data" + "github.com/gocolly/colly" +) + +func indeedConfig() *colly.Collector { + job := new(data.Job) + c := colly.NewCollector() + c.OnHTML("a[id^='job_']", func(e *colly.HTMLElement) { + link := e.Attr("href") + c.Visit(e.Request.AbsoluteURL(link)) + }) + c.OnHTML("h1[class^='jobserch-JobInfoHeader-title']", func(e *colly.HTMLElement) { + + }) + + return c +} diff --git a/data/jobs.go b/data/jobs.go new file mode 100644 index 0000000..c47d142 --- /dev/null +++ b/data/jobs.go @@ -0,0 +1,32 @@ +package data + +import ( + "encoding/json" + "time" +) + +type Job struct { + posting JobPosting + details JobDetails +} + +type JobPosting struct { + website string + url string + location string + company string + position string + jobType string + workShift string + workSetting string + lastModified time.Time +} + +type JobDetails struct { + skills json.Marshaler + licenses json.Marshaler + certs json.Marshaler + education json.Marshaler + benefits json.Marshaler + fullJobDescription string +} diff --git a/db/connect.go b/db/connect.go new file mode 100644 index 0000000..2e7123d --- /dev/null +++ b/db/connect.go @@ -0,0 +1,61 @@ +package db + +import ( + "context" + "database/sql" + "fmt" + "log/slog" + "strconv" + + dbSql "github.com/Wacky404/lurchers/db" + "github.com/Wacky404/lurchers/util" +) + +type Database struct { + conn *sql.DB + ctx *context.Context + host string + port int64 + user string + password string + name string + sql *dbSql.DbStatements +} + +// creates an instance of the Database struct and loads in env vars +func LoadConfig(ctx *context.Context) (*Database, error) { + port, err := strconv.ParseInt(util.GetVar("DB_PORT", ""), 10, 64) + if err != nil { + slog.Error("error loading .env var port", slog.Any("error", err)) + return nil, err + } + db := &Database{ + ctx: ctx, + host: util.GetVar("DB_HOST", ""), + port: port, + user: util.GetVar("DB_USER", ""), + password: util.GetVar("DB_PASSWORD", ""), + name: util.GetVar("DB_NAME", ""), + sql: dbSql.NewDbStatements(), + } + + return db, nil +} + +// connecting to the precious +func (d *Database) Connect() error { + psqlInfo := fmt.Sprintf("host=%s port=%d user=%s "+ + "password=%s dbname=%s sslmode=disable", + d.host, d.port, d.user, d.password, d.name) + conn, err := sql.Open("postgres", psqlInfo) + if err != nil { + slog.Error("error connecting to the database", slog.Any("error", err)) + } + + if err := conn.Ping(); err != nil { + return err + } + + d.conn = conn + return nil +} diff --git a/db/sql.go b/db/sql.go new file mode 100644 index 0000000..4a9acfb --- /dev/null +++ b/db/sql.go @@ -0,0 +1,45 @@ +package db + +/* +sql statements to be run against the specified Database +in connect.go +*/ + +type JobPosting struct { + addRow string +} + +func NewJobPosting() *JobPosting { + return &JobPosting{addRow: addRowJobPosting} +} + +type JobDetails struct { + addRow string +} + +func NewJobDetails() *JobDetails { + return &JobDetails{addRow: addRowJobDetails} +} + +type DbStatements struct { + JobPosting *JobPosting + JobDetails *JobDetails +} + +func NewDbStatements() *DbStatements { + return &DbStatements{JobPosting: NewJobPosting(), JobDetails: NewJobDetails()} +} + +// table job_posting; add row +// return a job_id; to be used as FK +var addRowJobPosting string = ` + INSERT INTO job_posting (job_type, website, url, location, company, position, work_shift, work_setting, date_added, last_updated) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10) + RETURNING job_id + ` + +// table job_details; add row +var addRowJobDetails string = ` + INSERT INTO job_details (job_id, skills, licences, certifications, education, benefits, full_job_description) + VALUES ($1, $2, $3, $4, $5, $6, $7) + ` diff --git a/evade/agent.go b/evade/agent.go index f5da98f..fae8bcc 100644 --- a/evade/agent.go +++ b/evade/agent.go @@ -1,11 +1,13 @@ package evade import ( + "context" + "github.com/gocolly/colly" "github.com/gocolly/colly/extensions" ) -func NewUserAgent(c *colly.Collector) { +func NewUserAgent(ctx context.Context, c *colly.Collector) { extensions.RandomUserAgent(c) } diff --git a/evade/proxy.go b/evade/proxy.go index dbe1cec..a388960 100644 --- a/evade/proxy.go +++ b/evade/proxy.go @@ -1,7 +1,7 @@ package evade import ( - "errors" + "fmt" "github.com/gocolly/colly" "github.com/gocolly/colly/proxy" @@ -10,7 +10,7 @@ import ( func RotateProxy(c *colly.Collector, proxies *[]string) error { rp, err := proxy.RoundRobinProxySwitcher(*proxies...) if err != nil { - return errors.New("proxies were not set for collector") + return fmt.Errorf("error setting up proxy switcher: %s", err.Error()) } c.SetProxyFunc(rp) @@ -3,21 +3,27 @@ module github.com/Wacky404/lurchers go 1.23.4 require ( + github.com/gocolly/colly v1.2.0 + github.com/joho/godotenv v1.5.1 +) + +require ( github.com/PuerkitoBio/goquery v1.10.1 // indirect github.com/andybalholm/cascadia v1.3.3 // indirect github.com/antchfx/htmlquery v1.3.4 // indirect github.com/antchfx/xmlquery v1.4.3 // indirect github.com/antchfx/xpath v1.3.3 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect github.com/gobwas/glob v0.2.3 // indirect - github.com/gocolly/colly v1.2.0 // indirect github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect github.com/golang/protobuf v1.5.4 // indirect - github.com/jawher/mow.cli v1.2.0 // indirect github.com/kennygrant/sanitize v1.2.4 // indirect github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect + github.com/stretchr/testify v1.4.0 // indirect github.com/temoto/robotstxt v1.1.2 // indirect golang.org/x/net v0.34.0 // indirect golang.org/x/text v0.21.0 // indirect google.golang.org/appengine v1.6.8 // indirect google.golang.org/protobuf v1.36.3 // indirect + gopkg.in/yaml.v2 v2.2.5 // indirect ) @@ -9,6 +9,7 @@ github.com/antchfx/xmlquery v1.4.3/go.mod h1:AEPEEPYE9GnA2mj5Ur2L5Q5/2PycJ0N9Fus github.com/antchfx/xpath v1.3.3 h1:tmuPQa1Uye0Ym1Zn65vxPgfltWb/Lxu2jeqIGteJSRs= github.com/antchfx/xpath v1.3.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= @@ -22,17 +23,19 @@ github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiu github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/jawher/mow.cli v1.2.0 h1:e6ViPPy+82A/NFF/cfbq3Lr6q4JHKT9tyHwTCcUQgQw= -github.com/jawher/mow.cli v1.2.0/go.mod h1:y+pcA3jBAdo/GIZx/0rFjw/K2bVEODP9rfZOfaiq8Ko= +github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= +github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o= github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA= github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo= @@ -113,4 +116,5 @@ google.golang.org/protobuf v1.36.3 h1:82DV7MYdb8anAVi3qge1wSnMDrnKK7ebr+I0hHRN1B google.golang.org/protobuf v1.36.3/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.5 h1:ymVxjfMaHvXD8RqPRmzHHsB3VvucivSkIAvJFDI5O3c= gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/handler/err.go b/handler/err.go new file mode 100644 index 0000000..8160a2b --- /dev/null +++ b/handler/err.go @@ -0,0 +1,9 @@ +package handler + +import ( + "fmt" + "errors" + "github.com/gocolly/colly" +) + + diff --git a/handler/img.go b/handler/img.go new file mode 100644 index 0000000..858e52f --- /dev/null +++ b/handler/img.go @@ -0,0 +1,10 @@ +package handler + +import ( + "log" +) + +func EventImg(event string, callback func(string)) { + log.Printf() + callback(event) +} diff --git a/util/env.go b/util/env.go new file mode 100644 index 0000000..8c59052 --- /dev/null +++ b/util/env.go @@ -0,0 +1,12 @@ +package util + +import "os" + +func GetVar(key string, fallback string) string { + val, ok := os.LookupEnv(key) + if !ok { + return fallback + } + + return val +} diff --git a/util/logger.go b/util/logger.go new file mode 100644 index 0000000..94d0563 --- /dev/null +++ b/util/logger.go @@ -0,0 +1,99 @@ +package util + +import ( + "context" + "fmt" + "log/slog" + "os" +) + +type MultiHandler struct { + handlers []slog.Handler +} + +func NewMultiHandler(handlers ...slog.Handler) *MultiHandler { + return &MultiHandler{handlers: handlers} +} + +func (m *MultiHandler) Enabled(ctx context.Context, level slog.Level) bool { + for _, h := range m.handlers { + if h.Enabled(ctx, level) { + return true + } + } + return false +} + +func (m *MultiHandler) Handle(ctx context.Context, record slog.Record) error { + for _, h := range m.handlers { + _ = h.Handle(ctx, record) // Process each handler, ignoring errors + } + return nil +} + +func (m *MultiHandler) WithAttrs(attrs []slog.Attr) slog.Handler { + newHandlers := make([]slog.Handler, len(m.handlers)) + for i, h := range m.handlers { + newHandlers[i] = h.WithAttrs(attrs) + } + return &MultiHandler{handlers: newHandlers} +} + +func (m *MultiHandler) WithGroup(name string) slog.Handler { + newHandlers := make([]slog.Handler, len(m.handlers)) + for i, h := range m.handlers { + newHandlers[i] = h.WithGroup(name) + } + return &MultiHandler{handlers: newHandlers} +} + +type options struct { + logName string + stdoutLevel slog.Level + fileLevel slog.Level +} + +func SetupLogger(opts ...func(*options)) (*os.File, error) { + o := options{ + logName: "logs/Default.log", + stdoutLevel: slog.LevelInfo, + fileLevel: slog.LevelDebug, + } + + for _, opt := range opts { + opt(&o) + } + + logFile, err := os.OpenFile(o.logName, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o666) + if err != nil { + return nil, fmt.Errorf("no file opened for logs: %v", err) + } + + stdoutHandler := slog.NewTextHandler(os.Stdout, + &slog.HandlerOptions{Level: o.stdoutLevel}) + fileHandler := slog.NewJSONHandler(logFile, + &slog.HandlerOptions{Level: o.fileLevel}) + + logger := slog.New(NewMultiHandler(stdoutHandler, fileHandler)) + slog.SetDefault(logger) + + return logFile, err +} + +func WithLogName(name string) func(*options) { + return func(o *options) { + o.logName = name + } +} + +func WithStdoutLevel(level slog.Level) func(*options) { + return func(o *options) { + o.stdoutLevel = level + } +} + +func WithFileLevel(level slog.Level) func(*options) { + return func(o *options) { + o.fileLevel = level + } +} |
