diff options
| -rw-r--r-- | .gitignore | 3 | ||||
| -rw-r--r-- | cmd/lurchers/main.go | 11 | ||||
| -rw-r--r-- | data/configs/indeed.go | 4 | ||||
| -rw-r--r-- | evade/agent.go | 14 | ||||
| -rw-r--r-- | evade/proxy.go | 18 | ||||
| -rw-r--r-- | howlers/.python-version | 1 | ||||
| -rw-r--r-- | howlers/README.md | 0 | ||||
| -rw-r--r-- | howlers/pyproject.toml | 14 | ||||
| -rw-r--r-- | howlers/src/__init__.py | 0 | ||||
| -rw-r--r-- | howlers/src/hello.jac | 3 | ||||
| -rw-r--r-- | internal/db/connect.go (renamed from db/connect.go) | 21 | ||||
| -rw-r--r-- | internal/db/sql.go (renamed from db/sql.go) | 0 | ||||
| -rw-r--r-- | internal/handler/err.go (renamed from handler/err.go) | 0 | ||||
| -rw-r--r-- | internal/handler/img.go (renamed from handler/img.go) | 0 | ||||
| -rw-r--r-- | internal/lcommon.go | 304 | ||||
| -rw-r--r-- | internal/procs/chldspawn.go | 14 | ||||
| -rw-r--r-- | internal/url/addr.go | 32 | ||||
| -rw-r--r-- | urls/addr.go | 30 | ||||
| -rw-r--r-- | util/env.go | 12 | ||||
| -rw-r--r-- | util/logger.go | 99 |
20 files changed, 396 insertions, 184 deletions
@@ -8,6 +8,9 @@ *.so *.dylib +# JacLang things +.jac/ + # Test binary, built with `go test -c` *.test diff --git a/cmd/lurchers/main.go b/cmd/lurchers/main.go index 85db16b..46a56ab 100644 --- a/cmd/lurchers/main.go +++ b/cmd/lurchers/main.go @@ -40,8 +40,19 @@ func main() { // before making a request print "Visiting..." i.C.OnRequest(func(r *colly.Request) { + r.Headers.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") + r.Headers.Set("Accept-Language", "en-US,en;q=0.5") + r.Headers.Set("Connection", "keep-alive") + r.Headers.Set("Upgrade-Insecure-Requests", "1") + slog.Info("Going to website", slog.String("Request URL", r.URL.String())) }) + + // on error + i.C.OnError(func(r *colly.Response, err error) { + slog.Error("This is not working fam", slog.Any("Request URL", r.Request.URL), slog.Any("Response", r), slog.Any("error", err)) + }) + // start scraping on website(s) i.C.Visit(i.Data.Posting.Url) i.C.Wait() diff --git a/data/configs/indeed.go b/data/configs/indeed.go index d1ba4e4..2eb2bbc 100644 --- a/data/configs/indeed.go +++ b/data/configs/indeed.go @@ -34,5 +34,9 @@ func IndeedConfig() *CollyCfg { slog.Info("Job Found", slog.String("Position", jobPosition)) }) + cfg.C.OnResponse(func(r *colly.Response) { + slog.Info(string(r.Body)) + }) + return cfg } diff --git a/evade/agent.go b/evade/agent.go deleted file mode 100644 index fae8bcc..0000000 --- a/evade/agent.go +++ /dev/null @@ -1,14 +0,0 @@ -package evade - -import ( - "context" - - "github.com/gocolly/colly" - "github.com/gocolly/colly/extensions" -) - -func NewUserAgent(ctx context.Context, c *colly.Collector) { - extensions.RandomUserAgent(c) -} - - diff --git a/evade/proxy.go b/evade/proxy.go deleted file mode 100644 index a388960..0000000 --- a/evade/proxy.go +++ /dev/null @@ -1,18 +0,0 @@ -package evade - -import ( - "fmt" - - "github.com/gocolly/colly" - "github.com/gocolly/colly/proxy" -) - -func RotateProxy(c *colly.Collector, proxies *[]string) error { - rp, err := proxy.RoundRobinProxySwitcher(*proxies...) - if err != nil { - return fmt.Errorf("error setting up proxy switcher: %s", err.Error()) - } - c.SetProxyFunc(rp) - - return nil -} diff --git a/howlers/.python-version b/howlers/.python-version new file mode 100644 index 0000000..e4fba21 --- /dev/null +++ b/howlers/.python-version @@ -0,0 +1 @@ +3.12 diff --git a/howlers/README.md b/howlers/README.md new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/howlers/README.md diff --git a/howlers/pyproject.toml b/howlers/pyproject.toml new file mode 100644 index 0000000..3e50d19 --- /dev/null +++ b/howlers/pyproject.toml @@ -0,0 +1,14 @@ +[project] +name = "howlers" +version = "0.0.7" +description = "Configuration for data collection jobs using JacLang." +readme = "README.md" +authors = [ + { name = "Wayne-Cole", email = "77279425+Wacky404@users.noreply.github.com" } +] +requires-python = ">=3.12" +dependencies = [] + +[build-system] +requires = ["uv_build>=0.9.16,<0.10.0"] +build-backend = "uv_build" diff --git a/howlers/src/__init__.py b/howlers/src/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/howlers/src/__init__.py diff --git a/howlers/src/hello.jac b/howlers/src/hello.jac new file mode 100644 index 0000000..2d18e52 --- /dev/null +++ b/howlers/src/hello.jac @@ -0,0 +1,3 @@ +with entry { + print("Hello, World!"); +} diff --git a/db/connect.go b/internal/db/connect.go index 2e7123d..2588e63 100644 --- a/db/connect.go +++ b/internal/db/connect.go @@ -7,8 +7,7 @@ import ( "log/slog" "strconv" - dbSql "github.com/Wacky404/lurchers/db" - "github.com/Wacky404/lurchers/util" + "github.com/Wacky404/lurchers/internal" ) type Database struct { @@ -19,35 +18,35 @@ type Database struct { user string password string name string - sql *dbSql.DbStatements + sql *DbStatements } // creates an instance of the Database struct and loads in env vars func LoadConfig(ctx *context.Context) (*Database, error) { - port, err := strconv.ParseInt(util.GetVar("DB_PORT", ""), 10, 64) + port, err := strconv.ParseInt(internal.GetVar("DB_PORT", ""), 10, 64) if err != nil { slog.Error("error loading .env var port", slog.Any("error", err)) return nil, err } db := &Database{ ctx: ctx, - host: util.GetVar("DB_HOST", ""), + host: internal.GetVar("DB_HOST", ""), port: port, - user: util.GetVar("DB_USER", ""), - password: util.GetVar("DB_PASSWORD", ""), - name: util.GetVar("DB_NAME", ""), - sql: dbSql.NewDbStatements(), + user: internal.GetVar("DB_USER", ""), + password: internal.GetVar("DB_PASSWORD", ""), + name: internal.GetVar("DB_NAME", ""), + sql: NewDbStatements(), } return db, nil } // connecting to the precious -func (d *Database) Connect() error { +func (d *Database) Connect(service string) error { psqlInfo := fmt.Sprintf("host=%s port=%d user=%s "+ "password=%s dbname=%s sslmode=disable", d.host, d.port, d.user, d.password, d.name) - conn, err := sql.Open("postgres", psqlInfo) + conn, err := sql.Open(service, psqlInfo) if err != nil { slog.Error("error connecting to the database", slog.Any("error", err)) } diff --git a/db/sql.go b/internal/db/sql.go index 4a9acfb..4a9acfb 100644 --- a/db/sql.go +++ b/internal/db/sql.go diff --git a/handler/err.go b/internal/handler/err.go index 8160a2b..8160a2b 100644 --- a/handler/err.go +++ b/internal/handler/err.go diff --git a/handler/img.go b/internal/handler/img.go index 858e52f..858e52f 100644 --- a/handler/img.go +++ b/internal/handler/img.go diff --git a/internal/lcommon.go b/internal/lcommon.go new file mode 100644 index 0000000..d9c522f --- /dev/null +++ b/internal/lcommon.go @@ -0,0 +1,304 @@ +/* + * [https://cs.opensource.google/go/x/exp/+/master:mmap/mmap_unix.go] + * for mmap implementation that I borrowed + */ +package internal + +import ( + "context" + "errors" + "fmt" + "io" + "log/slog" + "os" + "runtime" + "syscall" +) + +func GetVar(key string, fallback string) string { + val, ok := os.LookupEnv(key) + if !ok { + return fallback + } + + return val +} + +/* + * ===================================================== + * Lurchers json logger; two handers for file and stdout + * ===================================================== + */ +type MultiHandler struct { + handlers []slog.Handler +} + +func NewMultiHandler(handlers ...slog.Handler) *MultiHandler { + return &MultiHandler{handlers: handlers} +} + +func (m *MultiHandler) Enabled(ctx context.Context, level slog.Level) bool { + for _, h := range m.handlers { + if h.Enabled(ctx, level) { + return true + } + } + return false +} + +func (m *MultiHandler) Handle(ctx context.Context, record slog.Record) error { + for _, h := range m.handlers { + _ = h.Handle(ctx, record) // Process each handler, ignoring errors + } + return nil +} + +func (m *MultiHandler) WithAttrs(attrs []slog.Attr) slog.Handler { + newHandlers := make([]slog.Handler, len(m.handlers)) + for i, h := range m.handlers { + newHandlers[i] = h.WithAttrs(attrs) + } + return &MultiHandler{handlers: newHandlers} +} + +func (m *MultiHandler) WithGroup(name string) slog.Handler { + newHandlers := make([]slog.Handler, len(m.handlers)) + for i, h := range m.handlers { + newHandlers[i] = h.WithGroup(name) + } + return &MultiHandler{handlers: newHandlers} +} + +type options struct { + logName string + stdoutLevel slog.Level + fileLevel slog.Level +} + +func SetupLogger(opts ...func(*options)) (*os.File, error) { + o := options{ + logName: "logs/Default.log", + stdoutLevel: slog.LevelInfo, + fileLevel: slog.LevelDebug, + } + + for _, opt := range opts { + opt(&o) + } + + logFile, err := os.OpenFile(o.logName, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o666) + if err != nil { + return nil, fmt.Errorf("no file opened for logs: %v", err) + } + + stdoutHandler := slog.NewTextHandler(os.Stdout, + &slog.HandlerOptions{Level: o.stdoutLevel}) + fileHandler := slog.NewJSONHandler(logFile, + &slog.HandlerOptions{Level: o.fileLevel}) + + logger := slog.New(NewMultiHandler(stdoutHandler, fileHandler)) + slog.SetDefault(logger) + + return logFile, err +} + +func WithLogName(name string) func(*options) { + return func(o *options) { + o.logName = name + } +} + +func WithStdoutLevel(level slog.Level) func(*options) { + return func(o *options) { + o.stdoutLevel = level + } +} + +func WithFileLevel(level slog.Level) func(*options) { + return func(o *options) { + o.fileLevel = level + } +} + +/* + * ======================================================= + * Lurchers Event logger; logs every event from child proc + * using the SPSC model + * ======================================================= + */ +const ( + MAX_EVENTS int = 256 + MAX_CHLD_PROC int = 10000 + SIZE_RING_BUFF int = 16384 + SIZE_FILE int = SIZE_RING_BUFF * MAX_CHLD_PROC // 160 mb + debug bool = false +) + +type EventType int + +const ( + CHLD_PROC_START = iota + CHLD_PROC_DONE + CHLD_PROC_FAILED + CHLD_PROC_HURT + CHLD_PROC_HEALING + CHLD_PROC_HEALED +) + +var eventName = map[EventType]string{ + CHLD_PROC_START: "child_start", + CHLD_PROC_DONE: "child_done", + CHLD_PROC_FAILED: "child_failed", + CHLD_PROC_HURT: "child_hurt", + CHLD_PROC_HEALING: "child_healing", + CHLD_PROC_HEALED: "child_healed", +} + +func (et EventType) String() string { + return eventName[et] +} + +type SysLurchEvent_t struct { + EventTime int + EventKind EventType + EventID int + Data1, Data2 int +} + +type WatchMen struct { + eventHead int + eventTail int + EventQue [MAX_EVENTS]SysLurchEvent_t +} + +// single consumer +func (w *WatchMen) SysGetEvent() *SysLurchEvent_t { + if (*w).eventHead == (*w).eventTail { + return nil // buffer is empty + } + + ev := &(*w).EventQue[(*w).eventTail] + (*w).eventTail = ((*w).eventTail + 1) % MAX_EVENTS + + return ev +} + +// ReaderAt reads a memory-mapped file (.mem) +// Like any io.ReaderAt, clients can execute parallel ReadAt calls, but it is +// not safe to call Close and reading methods concurrently. +type ReaderAt struct { + data []byte +} + +// implements the io.ReaderAt interface +func (r *ReaderAt) ReadAt(p []byte, offset int64) (int, error) { + if r.data == nil { + return 0, errors.New("mmap: closed") + } + if offset < 0 || int64(len(r.data)) < offset { + return 0, fmt.Errorf("mmap: invalid ReadAt offset %d", offset) + } + + n := copy(p, r.data[offset:]) + if n < len(p) { + return n, io.EOF + } + + return n, nil +} + +func (r *ReaderAt) Close() error { + if r.data == nil { + return nil + } else if len(r.data) == 0 { + r.data = nil + return nil + } + + data := r.data + r.data = nil + if debug { + var p *byte + if len(data) != 0 { + p = &data[0] + } + println("munmap", r, p) + } + + runtime.SetFinalizer(r, nil) + + return syscall.Munmap(data) +} + +func Open(filename string) (*ReaderAt, error) { + // just opening a file that will store + // bytes of data that parent proc and + // and child proc(s) share + f, err := os.Open(filename) + if err != nil { + return nil, err + } + defer f.Close() + + fs, err := f.Stat() + if err != nil { + return nil, err + } + + // size of the memory file in question + size := fs.Size() + if size == 0 { + // Treat (size == 0) as a special case, truncating the + // file to the specified file size, in the case that the + // file is new; upon first run of program. + f.Truncate(int64(SIZE_FILE)) + } + if size < 0 { + return nil, fmt.Errorf("mmap: file %q has negative size", filename) + } + if size != int64(int(size)) { + return nil, fmt.Errorf("mmap: file %q is too large", filename) + } + + data, err := syscall.Mmap(int(f.Fd()), 0, int(size), syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_SHARED) + if err != nil { + return nil, err + } + r := &ReaderAt{data} + if debug { + var p *byte + if len(data) != 0 { + p = &data[0] + } + println("mmap", r, p) + } + + runtime.SetFinalizer(r, (*ReaderAt).Close) + + return r, nil +} + +/* +func isBufferFull(eventHead int, eventTail int) bool { + return ((eventHead + 1) % MAX_EVENTS) == eventTail +} + +// single producer +func SysQueueEvent(time int, kind EventType, id int, data1 int, data2 int) error { + if IsBufferFull() { + // overflow protection + fmt.Printf("warning: Event queue overflow! Event %s dropped\n", kind.String()) + return fmt.Errorf("warning: Event queue overflow! Event %s dropped", kind.String()) + } + + EventQue[eventHead].EventTime = time + EventQue[eventHead].EventKind = kind + EventQue[eventHead].EventID = id + EventQue[eventHead].Data1 = data1 + EventQue[eventHead].Data2 = data2 + + eventHead = (eventHead + 1) % MAX_EVENTS + + return nil +} +*/ diff --git a/internal/procs/chldspawn.go b/internal/procs/chldspawn.go new file mode 100644 index 0000000..eb7db69 --- /dev/null +++ b/internal/procs/chldspawn.go @@ -0,0 +1,14 @@ +package procs + +import ( + "context" + "os" + + "github.com/Wacky404/lurchers/internal/url" +) + +type Proc struct { + ID *os.Process + ctx context.Context + opts *url.UserParams +} diff --git a/internal/url/addr.go b/internal/url/addr.go new file mode 100644 index 0000000..555a596 --- /dev/null +++ b/internal/url/addr.go @@ -0,0 +1,32 @@ +package url + +import ( + "errors" + "strings" +) + +type Websites_t []string + +type UserParams struct { + Websites Websites_t + Timeout int + Retry bool +} + +func (u *UserParams) NewWebsite(w string) (*Websites_t, error) { + if !strings.HasPrefix(w, "http://") || !strings.HasPrefix(w, "https://") { + return &(*u).Websites, errors.New("url entered is not an http or https url") + } + + u.Websites[len((*u).Websites)] = w + + return &(*u).Websites, nil +} + +func (u *UserParams) GetWebsites() (*Websites_t, error) { + if (*u).Websites == nil { + return nil, errors.New("no websites given in user params") + } + + return &(*u).Websites, nil +} diff --git a/urls/addr.go b/urls/addr.go deleted file mode 100644 index 6de7d3a..0000000 --- a/urls/addr.go +++ /dev/null @@ -1,30 +0,0 @@ -package url - -import ( - "errors" - "strings" -) - -type Websites map[string]string - -type userParams struct { - websites Websites - timeout int - retry bool -} - -func NewWebsite(w string) (string, error) { - if strings.HasPrefix(w, "http://") || strings.HasPrefix(w, "https://") { - return w, nil - } - - return w, errors.New("url entered is not an http or https url") -} - -func (u *userParams) Getwebsites() (map[string]string, error) { - if u.websites == nil { - return nil, errors.New("no websites given in user params") - } - - return u.websites, nil -} diff --git a/util/env.go b/util/env.go deleted file mode 100644 index 8c59052..0000000 --- a/util/env.go +++ /dev/null @@ -1,12 +0,0 @@ -package util - -import "os" - -func GetVar(key string, fallback string) string { - val, ok := os.LookupEnv(key) - if !ok { - return fallback - } - - return val -} diff --git a/util/logger.go b/util/logger.go deleted file mode 100644 index 94d0563..0000000 --- a/util/logger.go +++ /dev/null @@ -1,99 +0,0 @@ -package util - -import ( - "context" - "fmt" - "log/slog" - "os" -) - -type MultiHandler struct { - handlers []slog.Handler -} - -func NewMultiHandler(handlers ...slog.Handler) *MultiHandler { - return &MultiHandler{handlers: handlers} -} - -func (m *MultiHandler) Enabled(ctx context.Context, level slog.Level) bool { - for _, h := range m.handlers { - if h.Enabled(ctx, level) { - return true - } - } - return false -} - -func (m *MultiHandler) Handle(ctx context.Context, record slog.Record) error { - for _, h := range m.handlers { - _ = h.Handle(ctx, record) // Process each handler, ignoring errors - } - return nil -} - -func (m *MultiHandler) WithAttrs(attrs []slog.Attr) slog.Handler { - newHandlers := make([]slog.Handler, len(m.handlers)) - for i, h := range m.handlers { - newHandlers[i] = h.WithAttrs(attrs) - } - return &MultiHandler{handlers: newHandlers} -} - -func (m *MultiHandler) WithGroup(name string) slog.Handler { - newHandlers := make([]slog.Handler, len(m.handlers)) - for i, h := range m.handlers { - newHandlers[i] = h.WithGroup(name) - } - return &MultiHandler{handlers: newHandlers} -} - -type options struct { - logName string - stdoutLevel slog.Level - fileLevel slog.Level -} - -func SetupLogger(opts ...func(*options)) (*os.File, error) { - o := options{ - logName: "logs/Default.log", - stdoutLevel: slog.LevelInfo, - fileLevel: slog.LevelDebug, - } - - for _, opt := range opts { - opt(&o) - } - - logFile, err := os.OpenFile(o.logName, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o666) - if err != nil { - return nil, fmt.Errorf("no file opened for logs: %v", err) - } - - stdoutHandler := slog.NewTextHandler(os.Stdout, - &slog.HandlerOptions{Level: o.stdoutLevel}) - fileHandler := slog.NewJSONHandler(logFile, - &slog.HandlerOptions{Level: o.fileLevel}) - - logger := slog.New(NewMultiHandler(stdoutHandler, fileHandler)) - slog.SetDefault(logger) - - return logFile, err -} - -func WithLogName(name string) func(*options) { - return func(o *options) { - o.logName = name - } -} - -func WithStdoutLevel(level slog.Level) func(*options) { - return func(o *options) { - o.stdoutLevel = level - } -} - -func WithFileLevel(level slog.Level) func(*options) { - return func(o *options) { - o.fileLevel = level - } -} |
