package infrastructure // docker_scraper.go — Seeds the catalog with official Docker Hub images as // ProcessingResources at API startup, then refreshes on a configurable interval. // // Each image version (tag) becomes one *peerless* ProcessingInstance: // - CreatorID = "" (no owning peer) // - Partnerships = nil (no partnerships) // - Origin.Ref = "docker.io/:" (non-empty registry ref) // // This satisfies ResourceInstance.IsPeerless() and makes every instance freely // accessible to all peers without any pricing negotiation. // // Environment variables (all optional): // // DOCKER_SCRAPER_ENABLED true | false (default: true) // DOCKER_SCRAPER_IMAGES comma-separated list of images to track. // Format: "name" for official library images, // "org/name" for user/org images. // Default: a curated set of popular official images. // DOCKER_SCRAPER_MAX_TAGS max tags to import per image (default: 10) // DOCKER_SCRAPER_INTERVAL_H refresh interval in hours (default: 24) import ( "encoding/json" "fmt" "io" "net/http" "os" "regexp" "strconv" "strings" "time" "cloud.o-forge.io/core/oc-lib/dbs" "cloud.o-forge.io/core/oc-lib/models/common/enum" "cloud.o-forge.io/core/oc-lib/models/common/models" "cloud.o-forge.io/core/oc-lib/models/resources" "cloud.o-forge.io/core/oc-lib/models/utils" "cloud.o-forge.io/core/oc-lib/tools" "github.com/google/uuid" ) // ─── Configuration ──────────────────────────────────────────────────────────── // DockerImageSpec identifies one Docker Hub repository to scrape. // Namespace is "library" for official images, or the org/user name otherwise. type DockerImageSpec struct { Namespace string Name string } // scraperConfig holds all runtime parameters for the Docker Hub scraper. type scraperConfig struct { Enabled bool Images []DockerImageSpec MaxTags int IntervalHours int } // defaultImages is the baseline catalog seeded when DOCKER_SCRAPER_IMAGES is not set. 
var defaultImages = libraryImages(
	// Every entry must match a Docker Hub "library/<name>" repository exactly:
	// a misspelled name is never reported at startup, it just fails with an
	// HTTP 404 on every scrape cycle.
	"ubuntu", "debian", "alpine", "python", "golang", "node", "nginx",
	"postgres", "redis", "mysql", "redmine", "ruby", "rabbitmq", "nextcloud",
	"php", "wordpress", "fluentd", "gradle", "mongo", "clickhouse", "mariadb",
	"eclipse-temurin", "sonarqube", "neo4j", "rust", "oraclelinux", "openjdk",
	"traefik", "ghost", "docker", "websphere-liberty", "open-liberty", "storm",
	"swift", "rocket.chat", "odoo", "busybox", "nats", "mageia", "tomcat",
	"perl", "xwiki", "cassandra", "varnish", "ibm-semeru-runtimes",
	"archlinux", "clojure", "maven", "buildpack-deps", "solr", "groovy",
	"phpmyadmin", "hylang", "joomla", "matomo", "drupal", "yourls", "haproxy",
	"elixir", "geonetwork", "convertigo", "erlang", "azul-zulu", "kibana",
	"percona", "logstash", "elasticsearch", "krakend", "postfixadmin",
	"monica", "friendica", "sapmachine", "dart", "spiped",
	"amazoncorretto", // was misspelled "amazoncorreto"
	"zookeeper", "julia", "gcc", "ibmjava", "mediawiki", "couchbase", "jetty",
	"spark", // was misspelled "sparl"
	"tomee", "kapacitor", "ros", "silverpeas", "jruby", "neurodebian", "flink",
	"pypy", "orientdb",
	"liquibase", // was misspelled "liquidbase"
	"haxe", "r-base",
	"lightstreamer", // was misspelled "lighstreamer"
	"kong", "aerospike", "influxdb", "irssi", "rakudo-star", "satosa",
	"rethinkdb", "chronograf", "memcached", "backdrop", "telegraf", "httpd",
	"haskell", "emqx", "swipl", "couchdb", "hitch", "composer", "adminer",
	"amazonlinux", "bash", "caddy", "arangodb", "bonita", "photon",
	"almalinux", "teamspeak", "fedora", "eclipse-mosquitto", "registry",
	"eggdrop", "znc", "api-firewall", "alt", "unit", "clearlinux", "gazebo",
	"mongo-express", "plone", "cirros", "mono", "nats-streaming", "sl",
	"rockylinux", "notary", "vault", "jobber", "consul", "php-zendserver",
	"centos", "express-gateway", "clefos", "adoptopenjdk", "thrift",
	"rapidoid", "kaazing-gateway", "nuxeo",
	// "neo4j" used to be listed a second time here; the duplicate only caused
	// a redundant scrape of the same repository in every cycle.
	"fsharp", "sourcemage", "swarm", "euleros", "crux", "sentry", "known",
	"opensuse", "owncloud", "piwik", "jenkins", "celery", "iojs", "java",
	"rails", "django", "glassfish", "hipache", "ubuntu-upstart",
	"ubuntu-debootstrap", "docker-dev", "scratch",
)

// libraryImages builds one DockerImageSpec in the official "library" namespace
// for each repository name, preserving the order of names.
func libraryImages(names ...string) []DockerImageSpec {
	specs := make([]DockerImageSpec, 0, len(names))
	for _, name := range names {
		specs = append(specs, DockerImageSpec{Namespace: "library", Name: name})
	}
	return specs
}

// scraperConfigFromEnv reads scraper configuration from environment variables
// and returns a populated scraperConfig with sensible defaults.
func scraperConfigFromEnv() scraperConfig { cfg := scraperConfig{ Enabled: true, MaxTags: 10, IntervalHours: 24, Images: defaultImages, } if v := os.Getenv("DOCKER_SCRAPER_ENABLED"); v == "false" { cfg.Enabled = false } if v := os.Getenv("DOCKER_SCRAPER_MAX_TAGS"); v != "" { if n, err := strconv.Atoi(v); err == nil && n > 0 { cfg.MaxTags = n } } if v := os.Getenv("DOCKER_SCRAPER_INTERVAL_H"); v != "" { if h, err := strconv.Atoi(v); err == nil && h > 0 { cfg.IntervalHours = h } } if v := os.Getenv("DOCKER_SCRAPER_IMAGES"); v != "" { var specs []DockerImageSpec for _, raw := range strings.Split(v, ",") { raw = strings.TrimSpace(raw) if raw == "" { continue } if parts := strings.SplitN(raw, "/", 2); len(parts) == 2 { specs = append(specs, DockerImageSpec{Namespace: parts[0], Name: parts[1]}) } else { specs = append(specs, DockerImageSpec{Namespace: "library", Name: raw}) } } if len(specs) > 0 { cfg.Images = specs } } return cfg } // ─── Docker Hub API types ────────────────────────────────────────────────────── type hubRepoInfo struct { Description string `json:"description"` FullDescription string `json:"full_description"` } type hubTagImage struct { Architecture string `json:"architecture"` OS string `json:"os"` Digest string `json:"digest"` } type hubTag struct { Name string `json:"name"` FullSize int64 `json:"full_size"` LastUpdated string `json:"last_updated"` Images []hubTagImage `json:"images"` } type hubTagsResponse struct { Count int `json:"count"` Results []hubTag `json:"results"` } // reMarkdownImage matches the first Markdown image in a string, e.g. ![logo](https://…) var reMarkdownImage = regexp.MustCompile(`!\[[^\]]*\]\((https?://[^)]+)\)`) // extractLogoURL returns the first image URL found in a Markdown string, or "". func extractLogoURL(markdown string) string { if m := reMarkdownImage.FindStringSubmatch(markdown); len(m) == 2 { return m[1] } return "" } // fetchJSON performs a GET request to url and decodes the JSON body into out. 
func fetchJSON(url string, out interface{}) error { resp, err := http.Get(url) //nolint:noctx if err != nil { return err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return fmt.Errorf("HTTP %d for %s", resp.StatusCode, url) } body, err := io.ReadAll(resp.Body) if err != nil { return err } return json.Unmarshal(body, out) } // ─── Entry point ────────────────────────────────────────────────────────────── // StartDockerScraper starts the background Docker Hub scraper goroutine. // It runs a full scrape immediately at startup, then repeats on the configured // interval. This function blocks forever and is designed to be called via // `go infrastructure.StartDockerScraper()` from main(). func StartDockerScraper() { cfg := scraperConfigFromEnv() if !cfg.Enabled { fmt.Println("[docker-scraper] disabled (DOCKER_SCRAPER_ENABLED=false)") return } fmt.Printf("[docker-scraper] started — images=%d maxTags=%d interval=%dh\n", len(cfg.Images), cfg.MaxTags, cfg.IntervalHours) runScrape(cfg) ticker := time.NewTicker(time.Duration(cfg.IntervalHours) * time.Hour) defer ticker.Stop() for range ticker.C { runScrape(cfg) } } // runScrape executes one full scrape cycle for all configured images. func runScrape(cfg scraperConfig) { fmt.Printf("[docker-scraper] cycle started at %s\n", time.Now().Format(time.RFC3339)) for _, spec := range cfg.Images { if err := scrapeImage(spec, cfg.MaxTags); err != nil { fmt.Printf("[docker-scraper] %s/%s: %v\n", spec.Namespace, spec.Name, err) } } fmt.Printf("[docker-scraper] cycle done at %s\n", time.Now().Format(time.RFC3339)) } // ─── Per-image scraping ──────────────────────────────────────────────────────── // scrapeImage fetches Docker Hub metadata for one repository, then either creates // a new ProcessingResource in the catalog or extends the existing one with any // missing tag-instances. 
func scrapeImage(spec DockerImageSpec, maxTags int) error { // ── Fetch image metadata ────────────────────────────────────────────────── var info hubRepoInfo repoURL := fmt.Sprintf("https://hub.docker.com/v2/repositories/%s/%s/", spec.Namespace, spec.Name) if err := fetchJSON(repoURL, &info); err != nil { return fmt.Errorf("fetch repo info: %w", err) } // ── Fetch tags ──────────────────────────────────────────────────────────── tagsURL := fmt.Sprintf( "https://hub.docker.com/v2/repositories/%s/%s/tags?page_size=%d&ordering=last_updated", spec.Namespace, spec.Name, maxTags) var tagsResp hubTagsResponse if err := fetchJSON(tagsURL, &tagsResp); err != nil { return fmt.Errorf("fetch tags: %w", err) } if len(tagsResp.Results) == 0 { return nil // nothing to upsert } adminReq := &tools.APIRequest{Admin: true} accessor := (&resources.ProcessingResource{}).GetAccessor(adminReq) resourceName := spec.resourceName() existing := findProcessingResourceByName(accessor, resourceName) if existing == nil { return createDockerProcessingResource(accessor, spec, resourceName, info, tagsResp.Results) } return syncDockerInstances(accessor, existing, spec, tagsResp.Results) } // resourceName returns the canonical catalog name for a DockerImageSpec. // Official (library) images use just the image name; others use "org/image". func (s DockerImageSpec) resourceName() string { if s.Namespace == "library" { return s.Name } return s.Namespace + "/" + s.Name } // dockerRef builds the canonical pull reference for an image+tag pair. func dockerRef(spec DockerImageSpec, tag string) string { if spec.Namespace == "library" { return "docker.io/" + spec.Name + ":" + tag } return "docker.io/" + spec.Namespace + "/" + spec.Name + ":" + tag } // ─── DB helpers ─────────────────────────────────────────────────────────────── // findProcessingResourceByName loads all ProcessingResources (both draft and // published) and returns the first whose name matches exactly. 
func findProcessingResourceByName(accessor utils.Accessor, name string) *resources.ProcessingResource { filters := &dbs.Filters{ Or: map[string][]dbs.Filter{ "abstractresource.abstractobject.name": {{ Operator: dbs.LIKE.String(), Value: name, }}, "abstractobject.name": {{ Operator: dbs.LIKE.String(), Value: name, }}, }, } for _, draft := range []bool{false, true} { results, _, _ := accessor.Search(filters, "", draft, 0, 10) for _, r := range results { if pr, ok := r.(*resources.ProcessingResource); ok && pr.GetName() == name { return pr } } } return nil } // createDockerProcessingResource stores a brand-new ProcessingResource with one // peerless instance per Docker Hub tag, then publishes it (IsDraft = false). func createDockerProcessingResource( accessor utils.Accessor, spec DockerImageSpec, name string, info hubRepoInfo, tags []hubTag, ) error { resource := &resources.ProcessingResource{ AbstractInstanciatedResource: resources.AbstractInstanciatedResource[*resources.ProcessingInstance]{ AbstractResource: resources.AbstractResource{ AbstractObject: utils.AbstractObject{ UUID: uuid.New().String(), Name: name, }, Description: info.FullDescription, ShortDescription: info.Description, Logo: extractLogoURL(info.FullDescription), Owners: []utils.Owner{ {Name: "https://hub.docker.com/", Logo: "https://icones8.fr/icon/Wln8Z3PcXanx/logo-docker"}, }, }, }, Infrastructure: enum.DOCKER, OpenSource: true, IsService: false, } for i := range tags { resource.AddInstances(buildPeerlessInstance(spec, tags[i])) } // StoreOne goes through GenericStoreOne which calls AbstractResource.StoreDraftDefault() // setting IsDraft=true. We then publish with a raw update. 
stored, _, err := accessor.StoreOne(resource) if err != nil { return fmt.Errorf("store %q: %w", name, err) } pr := stored.(*resources.ProcessingResource) pr.IsDraft = false if _, _, err := utils.GenericRawUpdateOne(pr, pr.GetID(), accessor); err != nil { return fmt.Errorf("publish %q: %w", name, err) } fmt.Printf("[docker-scraper] created %q with %d instances\n", name, len(tags)) return nil } // syncDockerInstances adds to an existing ProcessingResource any tag-instances // that are not yet present (identified by Origin.Ref). Already-present tags // are left untouched to preserve any manually enriched metadata. func syncDockerInstances( accessor utils.Accessor, resource *resources.ProcessingResource, spec DockerImageSpec, tags []hubTag, ) error { existing := map[string]bool{} for _, inst := range resource.Instances { existing[inst.GetOrigin().Ref] = true } added := 0 for i := range tags { ref := dockerRef(spec, tags[i].Name) if existing[ref] { continue } resource.AddInstances(buildPeerlessInstance(spec, tags[i])) added++ } if added == 0 { return nil } if _, _, err := utils.GenericRawUpdateOne(resource, resource.GetID(), accessor); err != nil { return fmt.Errorf("sync instances for %q: %w", resource.GetName(), err) } fmt.Printf("[docker-scraper] added %d new instances to %q\n", added, resource.GetName()) return nil } // ─── Instance builder ───────────────────────────────────────────────────────── // buildPeerlessInstance creates a ProcessingInstance that satisfies // ResourceInstance.IsPeerless(): // // CreatorID = "" (zero value — no owning peer) // Partnerships = nil (zero value — no partnerships) // Origin.Ref != "" (set to the canonical docker pull reference) // // ProcessingInstance.StoreDraftDefault() enforces this invariant on write. func buildPeerlessInstance(spec DockerImageSpec, tag hubTag) *resources.ProcessingInstance { ref := dockerRef(spec, tag.Name) // Collect architecture hint from the first image manifest entry (if any). 
arch := "" if len(tag.Images) > 0 { arch = tag.Images[0].Architecture } return &resources.ProcessingInstance{ ResourceInstance: resources.ResourceInstance[*resources.ResourcePartnerShip[*resources.ProcessingResourcePricingProfile]]{ AbstractObject: utils.AbstractObject{ UUID: uuid.New().String(), Name: tag.Name, // CreatorID intentionally left empty — required for IsPeerless() }, Origin: resources.OriginMeta{ Type: resources.OriginPublic, Ref: ref, License: "", // filled in per-image if known (e.g. MIT, Apache-2.0) Verified: true, // official Docker Hub images are considered verified }, // Env / Inputs / Outputs left empty — can be enriched manually or by // future scrapers that read image labels / Docker Hub documentation. }, Access: &resources.ProcessingResourceAccess{ Container: &models.Container{ Image: ref, // Command, Args, Env, Volumes left empty — image defaults apply. Env: map[string]string{ "ARCH": arch, }, }, }, } }