summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWayne-Cole <77279425+Wacky404@users.noreply.github.com>2025-05-18 16:14:04 -0500
committerWayne-Cole <77279425+Wacky404@users.noreply.github.com>2025-05-18 16:14:04 -0500
commit291b2680ce39f1bf8224520742c19935d85cdd54 (patch)
treea9761ae9b9c71805808f5d9a8a45ab89caf184b4
parentf9bdc212b29ef30b9dcdf1e78bab4f19f7e204a7 (diff)
downloadlurchers-291b2680ce39f1bf8224520742c19935d85cdd54.tar.xz
lurchers-291b2680ce39f1bf8224520742c19935d85cdd54.zip
update: configured db connection and functions + minor changes to files
-rw-r--r--.gitignore7
-rw-r--r--cmd/lurchers/main.go39
-rw-r--r--data/configs/book.go78
-rw-r--r--data/configs/indeed.go20
-rw-r--r--data/jobs.go32
-rw-r--r--db/connect.go61
-rw-r--r--db/sql.go45
-rw-r--r--evade/agent.go4
-rw-r--r--evade/proxy.go4
-rw-r--r--go.mod10
-rw-r--r--go.sum10
-rw-r--r--handler/err.go9
-rw-r--r--handler/img.go10
-rw-r--r--util/env.go12
-rw-r--r--util/logger.go99
15 files changed, 420 insertions, 20 deletions
diff --git a/.gitignore b/.gitignore
index 6f72f89..ca8fb85 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@
*.test
# Output of the go coverage tool, specifically when used with LiteIDE
+**/bin/
*.out
# Dependency directories (remove the comment below to include it)
@@ -23,3 +24,9 @@ go.work.sum
# env file
.env
+
+# macos things
+.DS_Store
+
+#logs
+logs/
diff --git a/cmd/lurchers/main.go b/cmd/lurchers/main.go
index f6175b0..2a127f8 100644
--- a/cmd/lurchers/main.go
+++ b/cmd/lurchers/main.go
@@ -1,22 +1,37 @@
package main
import (
- "github.com/Wacky404/lurchers/data"
+ "context"
+ "log"
+ "log/slog"
+ "time"
+
+ "github.com/Wacky404/lurchers/util"
"github.com/gocolly/colly"
+ "github.com/joho/godotenv"
)
+// main sets up logging, creates a timeout context, loads the .env file and
+// then starts a crawl of the Crunchyroll manga listing.
func main() {
- bookShelve := []data.Book{}
- c := colly.NewCollector()
- c.OnHTML("div[class]", func(e *colly.HTMLElement) {
- className := e.Attr("class")
- if className == "product" {
- b := new(data.Book)
- }
- })
+ logFile, err := util.SetupLogger(util.WithLogName("logs/lurchers.log"))
+ if err != nil {
+ log.Fatal("error setting up logger", err)
+ }
+ defer logFile.Close()
+
+ // this time out value will change
+ // NOTE(review): ctx is not read anywhere else in this hunk - confirm the file builds.
+ ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*500)
+ defer cancel()
+
+ err = godotenv.Load()
+ if err != nil {
+ slog.Error("error loading .env file", slog.Any("error", err))
+ }
- c.OnHTML("a[href]", func(e *colly.HTMLElement) {
- link := e.Attr("href")
- c.Visit(e.Request.AbsoluteURL(link))
+ // before making a request print "Visiting..."
+ // NOTE(review): the line that declared c is removed above and nothing in
+ // this hunk re-declares it - confirm where c comes from.
+ c.OnRequest(func(r *colly.Request) {
+ slog.Info("Visiting", slog.String("Request URL", r.URL.String()))
})
+ // start scraping on website(s)
+ c.Visit("https://store.crunchyroll.com/collections/manga-books/?srule=Most-Popular")
+ c.Wait()
}
diff --git a/data/configs/book.go b/data/configs/book.go
index 27459e6..7f70f9c 100644
--- a/data/configs/book.go
+++ b/data/configs/book.go
@@ -1 +1,79 @@
+/*
+* TODO: Pagination works for the Crunchyroll store's manga-books listing, but
+* the price selector does not work yet and needs fixing. After that, add
+* persistent storage.
+ */
package configs
+
+import (
+ "context"
+ "log"
+ "log/slog"
+ "time"
+
+ "github.com/Wacky404/lurchers/evade"
+ "github.com/Wacky404/lurchers/util"
+ "github.com/gocolly/colly"
+ "github.com/joho/godotenv"
+)
+
+// main wires up logging, environment loading and a colly collector that
+// walks the Crunchyroll manga listing page by page and logs every product
+// it finds.
+//
+// NOTE(review): this file is in package configs, so the Go toolchain never
+// treats this function as a program entry point; it only runs when called
+// explicitly from inside the package.
+func main() {
+	logFile, err := util.SetupLogger(util.WithLogName("logs/lurchers.log"))
+	if err != nil {
+		log.Fatal("error setting up logger", err)
+	}
+	defer logFile.Close()
+
+	// this time out value will change
+	ctx, cancel := context.WithTimeout(context.Background(), time.Millisecond*500)
+	defer cancel()
+
+	// Not fatal: the TOR lookup below supplies its own fallback value.
+	if err = godotenv.Load(); err != nil {
+		slog.Error("error loading .env file", slog.Any("error", err))
+	}
+
+	// bookShelve := []data.Book{}
+	c := colly.NewCollector()
+	evade.NewUserAgent(ctx, c)
+	proxies := []string{util.GetVar("TOR", "socks5://127.0.0.1:9050")}
+	if err = evade.RotateProxy(c, &proxies); err != nil {
+		slog.Error("error configuring the RotateProxy", slog.Any("error", err))
+	}
+
+	c.OnHTML("div[class]", func(e *colly.HTMLElement) {
+		// Only exact class="product" containers are of interest.
+		if e.Attr("class") != "product" {
+			return
+		}
+		// Replace multiple newlines with a single space
+		// cleaned := strings.ReplaceAll(e.DOM.Text(), "\n", " ")
+		// remove multiple spaces
+		// cleaned = strings.Join(strings.Fields(cleaned), " ")
+
+		// NOTE(review): this reads the "content" attribute of the product
+		// div itself; the file-level TODO says the price selector is not
+		// working yet, so this value is presumably empty - confirm markup.
+		pricemsrp := e.Attr("content")
+		title := e.ChildText(".pdp-link")
+		priceDiscount := e.ChildText(".sales")
+		slog.Info("Product Found", slog.String("Title", title), slog.String("Price", pricemsrp), slog.String("Sale", priceDiscount))
+	})
+
+	c.OnHTML("a.image-tile-container", func(e *colly.HTMLElement) {
+		// "src" is an attribute, not an element, so it has to be read with
+		// ChildAttr. Assumes the tile wraps an <img> - TODO confirm markup.
+		srcLink := e.ChildAttr("img", "src")
+		slog.Debug(srcLink)
+	})
+
+	c.OnHTML("div.pagination-block ul li a", func(e *colly.HTMLElement) {
+		// Follow only the "next page" arrow, not every pagination link.
+		if e.Attr("class") != "right-arrow" {
+			return
+		}
+		next := e.Request.AbsoluteURL(e.Attr("href"))
+		if err := c.Visit(next); err != nil {
+			slog.Error("error visiting next page", slog.String("Request URL", next), slog.Any("error", err))
+		}
+	})
+
+	// before making a request print "Visiting..."
+	c.OnRequest(func(r *colly.Request) {
+		slog.Info("Visiting", slog.String("Request URL", r.URL.String()))
+	})
+
+	// start scraping on crunchyroll
+	if err = c.Visit("https://store.crunchyroll.com/collections/manga-books/?srule=Most-Popular"); err != nil {
+		slog.Error("error starting crawl", slog.Any("error", err))
+	}
+	c.Wait()
+}
diff --git a/data/configs/indeed.go b/data/configs/indeed.go
new file mode 100644
index 0000000..3fad0fd
--- /dev/null
+++ b/data/configs/indeed.go
@@ -0,0 +1,20 @@
+package configs
+
+import (
+ "github.com/Wacky404/lurchers/data"
+ "github.com/gocolly/colly"
+)
+
+// indeedConfig builds a collector for Indeed. It follows every anchor whose
+// id starts with "job_" and registers a (still empty) handler for the job
+// title heading of a posting page.
+func indeedConfig() *colly.Collector {
+	// data.Job only has unexported fields, so nothing can be stored on it
+	// from this package yet. The blank assignment keeps the variable from
+	// being reported as declared and not used until the handler fills it.
+	job := new(data.Job)
+	_ = job
+
+	c := colly.NewCollector()
+	c.OnHTML("a[id^='job_']", func(e *colly.HTMLElement) {
+		link := e.Attr("href")
+		// Best effort: a link that cannot be queued is skipped.
+		_ = c.Visit(e.Request.AbsoluteURL(link))
+	})
+	// The class prefix was misspelled as "jobserch", which can never match.
+	c.OnHTML("h1[class^='jobsearch-JobInfoHeader-title']", func(e *colly.HTMLElement) {
+		// TODO: extract the position title from e.
+	})
+
+	return c
+}
diff --git a/data/jobs.go b/data/jobs.go
new file mode 100644
index 0000000..c47d142
--- /dev/null
+++ b/data/jobs.go
@@ -0,0 +1,32 @@
+package data
+
+import (
+ "encoding/json"
+ "time"
+)
+
+// The scraped-job model. Field meanings follow from their names; nothing in
+// this file populates them.
+type (
+	// Job pairs the listing-level facts of a job with its detailed
+	// requirements.
+	Job struct {
+		posting JobPosting
+		details JobDetails
+	}
+
+	// JobPosting describes where a job was found and what kind of job it is.
+	JobPosting struct {
+		website, url                    string
+		location, company, position     string
+		jobType, workShift, workSetting string
+		lastModified                    time.Time
+	}
+
+	// JobDetails carries the requirement sections of a posting. Every
+	// section except the free-text description is held as a json.Marshaler.
+	JobDetails struct {
+		skills, licenses, certs, education, benefits json.Marshaler
+		fullJobDescription                           string
+	}
+)
diff --git a/db/connect.go b/db/connect.go
new file mode 100644
index 0000000..2e7123d
--- /dev/null
+++ b/db/connect.go
@@ -0,0 +1,61 @@
+package db
+
+import (
+ "context"
+ "database/sql"
+ "fmt"
+ "log/slog"
+ "strconv"
+
+ dbSql "github.com/Wacky404/lurchers/db"
+ "github.com/Wacky404/lurchers/util"
+)
+
+// Database bundles a database handle with the settings used to open it and
+// the SQL statement text that is run against it.
+//
+// NOTE(review): ctx stores a pointer to a context.Context; Go convention is
+// to pass a context by value per call rather than keep it in a struct.
+// NOTE(review): dbSql aliases an import path that resolves to this same
+// package (module github.com/Wacky404/lurchers, directory db) - confirm the
+// package builds with that import.
+type Database struct {
+	conn *sql.DB          // nil until Connect succeeds
+	ctx  *context.Context // supplied by the caller of LoadConfig
+
+	// connection settings, filled from the DB_* environment variables
+	host           string
+	port           int64
+	user, password string
+	name           string
+
+	sql *dbSql.DbStatements // statement text from NewDbStatements
+}
+
+// LoadConfig builds a Database from the DB_HOST, DB_PORT, DB_USER,
+// DB_PASSWORD and DB_NAME environment variables. It does not open a
+// connection; call Connect for that.
+//
+// It returns an error when DB_PORT is unset, is not a base-10 integer, or
+// lies outside the valid TCP port range 1-65535.
+func LoadConfig(ctx *context.Context) (*Database, error) {
+	rawPort := util.GetVar("DB_PORT", "")
+	port, err := strconv.ParseInt(rawPort, 10, 64)
+	if err != nil {
+		slog.Error("error loading .env var port", slog.Any("error", err))
+		return nil, fmt.Errorf("parsing DB_PORT %q: %w", rawPort, err)
+	}
+	// Reject values that parse as integers but cannot be a TCP port.
+	if port < 1 || port > 65535 {
+		err = fmt.Errorf("parsing DB_PORT %q: port must be between 1 and 65535", rawPort)
+		slog.Error("error loading .env var port", slog.Any("error", err))
+		return nil, err
+	}
+
+	db := &Database{
+		ctx:      ctx,
+		host:     util.GetVar("DB_HOST", ""),
+		port:     port,
+		user:     util.GetVar("DB_USER", ""),
+		password: util.GetVar("DB_PASSWORD", ""),
+		name:     util.GetVar("DB_NAME", ""),
+		sql:      dbSql.NewDbStatements(),
+	}
+
+	return db, nil
+}
+
+// Connect opens a postgres handle from the loaded settings, verifies it with
+// a ping and stores it on d. When opening or pinging fails the error is
+// returned and d.conn is left untouched.
+//
+// NOTE(review): no postgres driver import is visible in this file;
+// sql.Open("postgres", ...) only succeeds when a driver with that name has
+// been registered somewhere in the program - confirm.
+func (d *Database) Connect() error {
+	psqlInfo := fmt.Sprintf("host=%s port=%d user=%s "+
+		"password=%s dbname=%s sslmode=disable",
+		d.host, d.port, d.user, d.password, d.name)
+	conn, err := sql.Open("postgres", psqlInfo)
+	if err != nil {
+		slog.Error("error connecting to the database", slog.Any("error", err))
+		// conn is nil on this path, so falling through to Ping would panic.
+		return fmt.Errorf("opening database: %w", err)
+	}
+
+	if err := conn.Ping(); err != nil {
+		// Nothing else holds the handle yet; release it instead of leaking
+		// it. The ping error is the one worth reporting.
+		_ = conn.Close()
+		return err
+	}
+
+	d.conn = conn
+	return nil
+}
diff --git a/db/sql.go b/db/sql.go
new file mode 100644
index 0000000..4a9acfb
--- /dev/null
+++ b/db/sql.go
@@ -0,0 +1,45 @@
+package db
+
+/*
+sql statements to be run against the specified Database
+in connect.go
+*/
+
+// JobPosting holds the SQL statement text used for job postings.
+type JobPosting struct {
+	addRow string // INSERT statement for a single row
+}
+
+// NewJobPosting returns a JobPosting whose addRow is addRowJobPosting.
+func NewJobPosting() *JobPosting {
+	stmts := new(JobPosting)
+	stmts.addRow = addRowJobPosting
+	return stmts
+}
+
+// JobDetails holds the SQL statement text used for job details.
+type JobDetails struct {
+	addRow string // INSERT statement for a single row
+}
+
+// NewJobDetails returns a JobDetails whose addRow is addRowJobDetails.
+func NewJobDetails() *JobDetails {
+	stmts := new(JobDetails)
+	stmts.addRow = addRowJobDetails
+	return stmts
+}
+
+// DbStatements groups the statement sets, one per kind of record.
+type DbStatements struct {
+	JobPosting *JobPosting
+	JobDetails *JobDetails
+}
+
+// NewDbStatements returns a DbStatements with both statement sets built by
+// their constructors.
+func NewDbStatements() *DbStatements {
+	all := new(DbStatements)
+	all.JobPosting = NewJobPosting()
+	all.JobDetails = NewJobDetails()
+	return all
+}
+
+// addRowJobPosting inserts one row into table job_posting and returns the
+// new row's job_id, which is meant to be used as a foreign key. It takes ten
+// positional parameters in the column order listed in the statement.
+// Declared as a constant so the statement text cannot be reassigned.
+const addRowJobPosting = `
+ INSERT INTO job_posting (job_type, website, url, location, company, position, work_shift, work_setting, date_added, last_updated)
+ VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
+ RETURNING job_id
+ `
+
+// addRowJobDetails inserts one row into table job_details. It takes seven
+// positional parameters, the first being the job_id the details belong to.
+// Declared as a constant so the statement text cannot be reassigned.
+const addRowJobDetails = `
+ INSERT INTO job_details (job_id, skills, licences, certifications, education, benefits, full_job_description)
+ VALUES ($1, $2, $3, $4, $5, $6, $7)
+ `
diff --git a/evade/agent.go b/evade/agent.go
index f5da98f..fae8bcc 100644
--- a/evade/agent.go
+++ b/evade/agent.go
@@ -1,11 +1,13 @@
package evade
import (
+ "context"
+
"github.com/gocolly/colly"
"github.com/gocolly/colly/extensions"
)
+// NewUserAgent calls extensions.RandomUserAgent on the collector c.
+// The ctx parameter is accepted but the body does not read it.
-func NewUserAgent(c *colly.Collector) {
+func NewUserAgent(ctx context.Context, c *colly.Collector) {
extensions.RandomUserAgent(c)
}
diff --git a/evade/proxy.go b/evade/proxy.go
index dbe1cec..a388960 100644
--- a/evade/proxy.go
+++ b/evade/proxy.go
@@ -1,7 +1,7 @@
package evade
import (
- "errors"
+ "fmt"
"github.com/gocolly/colly"
"github.com/gocolly/colly/proxy"
@@ -10,7 +10,7 @@ import (
func RotateProxy(c *colly.Collector, proxies *[]string) error {
rp, err := proxy.RoundRobinProxySwitcher(*proxies...)
if err != nil {
- return errors.New("proxies were not set for collector")
+ return fmt.Errorf("error setting up proxy switcher: %s", err.Error())
}
c.SetProxyFunc(rp)
diff --git a/go.mod b/go.mod
index f51876d..42f4399 100644
--- a/go.mod
+++ b/go.mod
@@ -3,21 +3,27 @@ module github.com/Wacky404/lurchers
go 1.23.4
require (
+ github.com/gocolly/colly v1.2.0
+ github.com/joho/godotenv v1.5.1
+)
+
+require (
github.com/PuerkitoBio/goquery v1.10.1 // indirect
github.com/andybalholm/cascadia v1.3.3 // indirect
github.com/antchfx/htmlquery v1.3.4 // indirect
github.com/antchfx/xmlquery v1.4.3 // indirect
github.com/antchfx/xpath v1.3.3 // indirect
+ github.com/davecgh/go-spew v1.1.1 // indirect
github.com/gobwas/glob v0.2.3 // indirect
- github.com/gocolly/colly v1.2.0 // indirect
github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect
github.com/golang/protobuf v1.5.4 // indirect
- github.com/jawher/mow.cli v1.2.0 // indirect
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
+ github.com/stretchr/testify v1.4.0 // indirect
github.com/temoto/robotstxt v1.1.2 // indirect
golang.org/x/net v0.34.0 // indirect
golang.org/x/text v0.21.0 // indirect
google.golang.org/appengine v1.6.8 // indirect
google.golang.org/protobuf v1.36.3 // indirect
+ gopkg.in/yaml.v2 v2.2.5 // indirect
)
diff --git a/go.sum b/go.sum
index 248c8fe..b60b753 100644
--- a/go.sum
+++ b/go.sum
@@ -9,6 +9,7 @@ github.com/antchfx/xmlquery v1.4.3/go.mod h1:AEPEEPYE9GnA2mj5Ur2L5Q5/2PycJ0N9Fus
github.com/antchfx/xpath v1.3.3 h1:tmuPQa1Uye0Ym1Zn65vxPgfltWb/Lxu2jeqIGteJSRs=
github.com/antchfx/xpath v1.3.3/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y=
github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8=
@@ -22,17 +23,19 @@ github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiu
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
-github.com/jawher/mow.cli v1.2.0 h1:e6ViPPy+82A/NFF/cfbq3Lr6q4JHKT9tyHwTCcUQgQw=
-github.com/jawher/mow.cli v1.2.0/go.mod h1:y+pcA3jBAdo/GIZx/0rFjw/K2bVEODP9rfZOfaiq8Ko=
+github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
+github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/kennygrant/sanitize v1.2.4 h1:gN25/otpP5vAsO2djbMhF/LQX6R7+O1TB4yv8NzpJ3o=
github.com/kennygrant/sanitize v1.2.4/go.mod h1:LGsjYYtgxbetdg5owWB2mpgUL6e2nfw2eObZ0u0qvak=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d h1:hrujxIzL1woJ7AwssoOcM/tq5JjjG2yYOc8odClEiXA=
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d/go.mod h1:uugorj2VCxiV1x+LzaIdVa9b4S4qGAcH6cbhh4qVxOU=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
@@ -113,4 +116,5 @@ google.golang.org/protobuf v1.36.3 h1:82DV7MYdb8anAVi3qge1wSnMDrnKK7ebr+I0hHRN1B
google.golang.org/protobuf v1.36.3/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v2 v2.2.5 h1:ymVxjfMaHvXD8RqPRmzHHsB3VvucivSkIAvJFDI5O3c=
gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
diff --git a/handler/err.go b/handler/err.go
new file mode 100644
index 0000000..8160a2b
--- /dev/null
+++ b/handler/err.go
@@ -0,0 +1,9 @@
+package handler
+
+import (
+ "fmt"
+ "errors"
+ "github.com/gocolly/colly"
+)
+
+
diff --git a/handler/img.go b/handler/img.go
new file mode 100644
index 0000000..858e52f
--- /dev/null
+++ b/handler/img.go
@@ -0,0 +1,10 @@
+package handler
+
+import (
+ "log"
+)
+
+// EventImg logs the event name and then hands it to callback.
+// A nil callback is skipped, so the event is still logged.
+func EventImg(event string, callback func(string)) {
+	// log.Printf needs a format string; calling it with no arguments does
+	// not compile.
+	log.Printf("image event: %s", event)
+	if callback == nil {
+		return
+	}
+	callback(event)
+}
diff --git a/util/env.go b/util/env.go
new file mode 100644
index 0000000..8c59052
--- /dev/null
+++ b/util/env.go
@@ -0,0 +1,12 @@
+package util
+
+import "os"
+
+// GetVar returns the value of the environment variable key, or fallback when
+// the variable is not set. A variable that is set to the empty string is
+// returned as "" and is not replaced by fallback.
+func GetVar(key string, fallback string) string {
+	if value, found := os.LookupEnv(key); found {
+		return value
+	}
+	return fallback
+}
diff --git a/util/logger.go b/util/logger.go
new file mode 100644
index 0000000..94d0563
--- /dev/null
+++ b/util/logger.go
@@ -0,0 +1,99 @@
+package util
+
+import (
+ "context"
+ "fmt"
+ "log/slog"
+ "os"
+)
+
+// MultiHandler is a slog.Handler that fans each record out to a fixed list
+// of handlers, letting every handler apply its own minimum level.
+type MultiHandler struct {
+	handlers []slog.Handler
+}
+
+// NewMultiHandler returns a MultiHandler that forwards to handlers in the
+// order given.
+func NewMultiHandler(handlers ...slog.Handler) *MultiHandler {
+	return &MultiHandler{handlers: handlers}
+}
+
+// Enabled reports whether at least one wrapped handler accepts level.
+func (m *MultiHandler) Enabled(ctx context.Context, level slog.Level) bool {
+	for _, h := range m.handlers {
+		if h.Enabled(ctx, level) {
+			return true
+		}
+	}
+	return false
+}
+
+// Handle passes record to every wrapped handler that is enabled for the
+// record's level. All enabled handlers are tried even when one fails; the
+// first error encountered is returned.
+func (m *MultiHandler) Handle(ctx context.Context, record slog.Record) error {
+	var firstErr error
+	for _, h := range m.handlers {
+		// slog only promises that Handle is called after Enabled returned
+		// true, and Enabled above is true as soon as ANY handler accepts the
+		// level. Each handler therefore has to be asked again, otherwise a
+		// debug record meant for the file would also reach an info-level
+		// stdout handler.
+		if !h.Enabled(ctx, record.Level) {
+			continue
+		}
+		// Clone so that no handler shares record state with another.
+		if err := h.Handle(ctx, record.Clone()); err != nil && firstErr == nil {
+			firstErr = err
+		}
+	}
+	return firstErr
+}
+
+// WithAttrs returns a new MultiHandler whose wrapped handlers all carry
+// attrs.
+func (m *MultiHandler) WithAttrs(attrs []slog.Attr) slog.Handler {
+	newHandlers := make([]slog.Handler, len(m.handlers))
+	for i, h := range m.handlers {
+		newHandlers[i] = h.WithAttrs(attrs)
+	}
+	return &MultiHandler{handlers: newHandlers}
+}
+
+// WithGroup returns a new MultiHandler whose wrapped handlers all open the
+// group name.
+func (m *MultiHandler) WithGroup(name string) slog.Handler {
+	newHandlers := make([]slog.Handler, len(m.handlers))
+	for i, h := range m.handlers {
+		newHandlers[i] = h.WithGroup(name)
+	}
+	return &MultiHandler{handlers: newHandlers}
+}
+
+// options collects the settings read by SetupLogger: the path of the log
+// file and the minimum level for the stdout and the file handler.
+type options struct {
+	logName                string
+	stdoutLevel, fileLevel slog.Level
+}
+
+// SetupLogger installs the process-wide default slog logger. Records go to
+// stdout as text and to a log file as JSON, each with its own minimum level.
+//
+// Defaults are "logs/Default.log", info for stdout and debug for the file;
+// opts override them in the order given. The file is opened in append mode
+// and created if missing; its parent directory must already exist.
+//
+// The returned file is owned by the caller, who should close it on shutdown.
+// On failure the default logger is left unchanged and the returned error
+// wraps the one from os.OpenFile.
+func SetupLogger(opts ...func(*options)) (*os.File, error) {
+	o := options{
+		logName:     "logs/Default.log",
+		stdoutLevel: slog.LevelInfo,
+		fileLevel:   slog.LevelDebug,
+	}
+
+	for _, opt := range opts {
+		opt(&o)
+	}
+
+	logFile, err := os.OpenFile(o.logName, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o666)
+	if err != nil {
+		// %w keeps the cause inspectable with errors.Is / errors.As.
+		return nil, fmt.Errorf("no file opened for logs: %w", err)
+	}
+
+	stdoutHandler := slog.NewTextHandler(os.Stdout,
+		&slog.HandlerOptions{Level: o.stdoutLevel})
+	fileHandler := slog.NewJSONHandler(logFile,
+		&slog.HandlerOptions{Level: o.fileLevel})
+
+	logger := slog.New(NewMultiHandler(stdoutHandler, fileHandler))
+	slog.SetDefault(logger)
+
+	return logFile, nil
+}
+
+// WithLogName returns an option that sets the log file path to name.
+func WithLogName(name string) func(*options) {
+	setName := func(target *options) { target.logName = name }
+	return setName
+}
+
+// WithStdoutLevel returns an option that sets the minimum level of the
+// stdout handler to level.
+func WithStdoutLevel(level slog.Level) func(*options) {
+	setLevel := func(target *options) { target.stdoutLevel = level }
+	return setLevel
+}
+
+// WithFileLevel returns an option that sets the minimum level of the file
+// handler to level.
+func WithFileLevel(level slog.Level) func(*options) {
+	setLevel := func(target *options) { target.fileLevel = level }
+	return setLevel
+}