Files
oc-catalog/infrastructure/docker_scraper.go

583 lines
21 KiB
Go
Raw Normal View History

2026-04-01 15:56:05 +02:00
package infrastructure
// docker_scraper.go — Seeds the catalog with official Docker Hub images as
// ProcessingResources at API startup, then refreshes on a configurable interval.
//
// Each image version (tag) becomes one *peerless* ProcessingInstance:
// - CreatorID = "" (no owning peer)
// - Partnerships = nil (no partnerships)
// - Origin.Ref = "docker.io/<img>:<tag>" (non-empty registry ref)
//
// This satisfies ResourceInstance.IsPeerless() and makes every instance freely
// accessible to all peers without any pricing negotiation.
//
// Environment variables (all optional):
//
// DOCKER_SCRAPER_ENABLED true | false (default: true)
// DOCKER_SCRAPER_IMAGES comma-separated list of images to track.
// Format: "name" for official library images,
// "org/name" for user/org images.
// Default: a curated set of popular official images.
// DOCKER_SCRAPER_MAX_TAGS max tags to import per image (default: 10)
// DOCKER_SCRAPER_INTERVAL_H refresh interval in hours (default: 24)
import (
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"regexp"
"strconv"
"strings"
"time"
"cloud.o-forge.io/core/oc-lib/dbs"
"cloud.o-forge.io/core/oc-lib/models/common/enum"
"cloud.o-forge.io/core/oc-lib/models/common/models"
"cloud.o-forge.io/core/oc-lib/models/resources"
"cloud.o-forge.io/core/oc-lib/models/utils"
"cloud.o-forge.io/core/oc-lib/tools"
"github.com/google/uuid"
)
// ─── Configuration ────────────────────────────────────────────────────────────
// DockerImageSpec names a single Docker Hub repository tracked by the scraper.
type DockerImageSpec struct {
	// Namespace is "library" for official images; for every other image it
	// is the owning organisation or user name.
	Namespace string
	// Name is the repository name inside Namespace (for example "nginx").
	Name string
}
// scraperConfig bundles the runtime settings of the Docker Hub scraper.
type scraperConfig struct {
	// Enabled switches the whole scraper on or off.
	Enabled bool
	// Images lists the repositories visited on every scrape cycle.
	Images []DockerImageSpec
	// MaxTags is the number of tags requested from Docker Hub per image.
	MaxTags int
	// IntervalHours is the delay between two scrape cycles, in hours.
	IntervalHours int
}
// defaultImages is the baseline catalog seeded when DOCKER_SCRAPER_IMAGES is
// not set. Every entry is an official ("library") Docker Hub repository and
// appears exactly once; names must match the Docker Hub repository name
// exactly, otherwise the Hub API lookup for that image fails on every cycle.
var defaultImages = []DockerImageSpec{
	{Namespace: "library", Name: "ubuntu"},
	{Namespace: "library", Name: "debian"},
	{Namespace: "library", Name: "alpine"},
	{Namespace: "library", Name: "python"},
	{Namespace: "library", Name: "golang"},
	{Namespace: "library", Name: "node"},
	{Namespace: "library", Name: "nginx"},
	{Namespace: "library", Name: "postgres"},
	{Namespace: "library", Name: "redis"},
	{Namespace: "library", Name: "mysql"},
	{Namespace: "library", Name: "redmine"},
	{Namespace: "library", Name: "ruby"},
	{Namespace: "library", Name: "rabbitmq"},
	{Namespace: "library", Name: "nextcloud"},
	{Namespace: "library", Name: "php"},
	{Namespace: "library", Name: "wordpress"},
	{Namespace: "library", Name: "fluentd"},
	{Namespace: "library", Name: "gradle"},
	{Namespace: "library", Name: "mongo"},
	{Namespace: "library", Name: "clickhouse"},
	{Namespace: "library", Name: "mariadb"},
	{Namespace: "library", Name: "eclipse-temurin"},
	{Namespace: "library", Name: "sonarqube"},
	{Namespace: "library", Name: "neo4j"},
	{Namespace: "library", Name: "rust"},
	{Namespace: "library", Name: "oraclelinux"},
	{Namespace: "library", Name: "openjdk"},
	{Namespace: "library", Name: "traefik"},
	{Namespace: "library", Name: "ghost"},
	{Namespace: "library", Name: "docker"},
	{Namespace: "library", Name: "websphere-liberty"},
	{Namespace: "library", Name: "open-liberty"},
	{Namespace: "library", Name: "storm"},
	{Namespace: "library", Name: "swift"},
	{Namespace: "library", Name: "rocket.chat"},
	{Namespace: "library", Name: "odoo"},
	{Namespace: "library", Name: "busybox"},
	{Namespace: "library", Name: "nats"},
	{Namespace: "library", Name: "mageia"},
	{Namespace: "library", Name: "tomcat"},
	{Namespace: "library", Name: "perl"},
	{Namespace: "library", Name: "xwiki"},
	{Namespace: "library", Name: "cassandra"},
	{Namespace: "library", Name: "varnish"},
	{Namespace: "library", Name: "ibm-semeru-runtimes"},
	{Namespace: "library", Name: "archlinux"},
	{Namespace: "library", Name: "clojure"},
	{Namespace: "library", Name: "maven"},
	{Namespace: "library", Name: "buildpack-deps"},
	{Namespace: "library", Name: "solr"},
	{Namespace: "library", Name: "groovy"},
	{Namespace: "library", Name: "phpmyadmin"},
	{Namespace: "library", Name: "hylang"},
	{Namespace: "library", Name: "joomla"},
	{Namespace: "library", Name: "matomo"},
	{Namespace: "library", Name: "drupal"},
	{Namespace: "library", Name: "yourls"},
	{Namespace: "library", Name: "haproxy"},
	{Namespace: "library", Name: "elixir"},
	{Namespace: "library", Name: "geonetwork"},
	{Namespace: "library", Name: "convertigo"},
	{Namespace: "library", Name: "erlang"},
	{Namespace: "library", Name: "azul-zulu"},
	{Namespace: "library", Name: "kibana"},
	{Namespace: "library", Name: "percona"},
	{Namespace: "library", Name: "logstash"},
	{Namespace: "library", Name: "elasticsearch"},
	{Namespace: "library", Name: "krakend"},
	{Namespace: "library", Name: "postfixadmin"},
	{Namespace: "library", Name: "monica"},
	{Namespace: "library", Name: "friendica"},
	{Namespace: "library", Name: "sapmachine"},
	{Namespace: "library", Name: "dart"},
	{Namespace: "library", Name: "spiped"},
	{Namespace: "library", Name: "amazoncorretto"},
	{Namespace: "library", Name: "zookeeper"},
	{Namespace: "library", Name: "julia"},
	{Namespace: "library", Name: "gcc"},
	{Namespace: "library", Name: "ibmjava"},
	{Namespace: "library", Name: "mediawiki"},
	{Namespace: "library", Name: "couchbase"},
	{Namespace: "library", Name: "jetty"},
	{Namespace: "library", Name: "spark"},
	{Namespace: "library", Name: "tomee"},
	{Namespace: "library", Name: "kapacitor"},
	{Namespace: "library", Name: "ros"},
	{Namespace: "library", Name: "silverpeas"},
	{Namespace: "library", Name: "jruby"},
	{Namespace: "library", Name: "neurodebian"},
	{Namespace: "library", Name: "flink"},
	{Namespace: "library", Name: "pypy"},
	{Namespace: "library", Name: "orientdb"},
	{Namespace: "library", Name: "liquibase"},
	{Namespace: "library", Name: "haxe"},
	{Namespace: "library", Name: "r-base"},
	{Namespace: "library", Name: "lightstreamer"},
	{Namespace: "library", Name: "kong"},
	{Namespace: "library", Name: "aerospike"},
	{Namespace: "library", Name: "influxdb"},
	{Namespace: "library", Name: "irssi"},
	{Namespace: "library", Name: "rakudo-star"},
	{Namespace: "library", Name: "satosa"},
	{Namespace: "library", Name: "rethinkdb"},
	{Namespace: "library", Name: "chronograf"},
	{Namespace: "library", Name: "memcached"},
	{Namespace: "library", Name: "backdrop"},
	{Namespace: "library", Name: "telegraf"},
	{Namespace: "library", Name: "httpd"},
	{Namespace: "library", Name: "haskell"},
	{Namespace: "library", Name: "emqx"},
	{Namespace: "library", Name: "swipl"},
	{Namespace: "library", Name: "couchdb"},
	{Namespace: "library", Name: "hitch"},
	{Namespace: "library", Name: "composer"},
	{Namespace: "library", Name: "adminer"},
	{Namespace: "library", Name: "amazonlinux"},
	{Namespace: "library", Name: "bash"},
	{Namespace: "library", Name: "caddy"},
	{Namespace: "library", Name: "arangodb"},
	{Namespace: "library", Name: "bonita"},
	{Namespace: "library", Name: "photon"},
	{Namespace: "library", Name: "almalinux"},
	{Namespace: "library", Name: "teamspeak"},
	{Namespace: "library", Name: "fedora"},
	{Namespace: "library", Name: "eclipse-mosquitto"},
	{Namespace: "library", Name: "registry"},
	{Namespace: "library", Name: "eggdrop"},
	{Namespace: "library", Name: "znc"},
	{Namespace: "library", Name: "api-firewall"},
	{Namespace: "library", Name: "alt"},
	{Namespace: "library", Name: "unit"},
	{Namespace: "library", Name: "clearlinux"},
	{Namespace: "library", Name: "gazebo"},
	{Namespace: "library", Name: "mongo-express"},
	{Namespace: "library", Name: "plone"},
	{Namespace: "library", Name: "cirros"},
	{Namespace: "library", Name: "mono"},
	{Namespace: "library", Name: "nats-streaming"},
	{Namespace: "library", Name: "sl"},
	{Namespace: "library", Name: "rockylinux"},
	{Namespace: "library", Name: "notary"},
	{Namespace: "library", Name: "vault"},
	{Namespace: "library", Name: "jobber"},
	{Namespace: "library", Name: "consul"},
	{Namespace: "library", Name: "php-zendserver"},
	{Namespace: "library", Name: "centos"},
	{Namespace: "library", Name: "express-gateway"},
	{Namespace: "library", Name: "clefos"},
	{Namespace: "library", Name: "adoptopenjdk"},
	{Namespace: "library", Name: "thrift"},
	{Namespace: "library", Name: "rapidoid"},
	{Namespace: "library", Name: "kaazing-gateway"},
	{Namespace: "library", Name: "nuxeo"},
	{Namespace: "library", Name: "fsharp"},
	{Namespace: "library", Name: "sourcemage"},
	{Namespace: "library", Name: "swarm"},
	{Namespace: "library", Name: "euleros"},
	{Namespace: "library", Name: "crux"},
	{Namespace: "library", Name: "sentry"},
	{Namespace: "library", Name: "known"},
	{Namespace: "library", Name: "opensuse"},
	{Namespace: "library", Name: "owncloud"},
	{Namespace: "library", Name: "piwik"},
	{Namespace: "library", Name: "jenkins"},
	{Namespace: "library", Name: "celery"},
	{Namespace: "library", Name: "iojs"},
	{Namespace: "library", Name: "java"},
	{Namespace: "library", Name: "rails"},
	{Namespace: "library", Name: "django"},
	{Namespace: "library", Name: "glassfish"},
	{Namespace: "library", Name: "hipache"},
	{Namespace: "library", Name: "ubuntu-upstart"},
	{Namespace: "library", Name: "ubuntu-debootstrap"},
	{Namespace: "library", Name: "docker-dev"},
	{Namespace: "library", Name: "scratch"},
}
// scraperConfigFromEnv builds the scraper configuration from the process
// environment. Every variable that is unset, empty or invalid keeps its
// built-in default (enabled, 10 tags, 24 h, defaultImages).
//
//   - DOCKER_SCRAPER_ENABLED: parsed with strconv.ParseBool after trimming
//     whitespace, so "false", "FALSE", "0", "f"… all disable the scraper.
//     Values that do not parse as a boolean are ignored.
//   - DOCKER_SCRAPER_MAX_TAGS, DOCKER_SCRAPER_INTERVAL_H: strictly positive
//     integers; anything else is ignored.
//   - DOCKER_SCRAPER_IMAGES: comma-separated entries, "name" (official image,
//     namespace "library") or "namespace/name". Entries whose namespace or
//     name is empty (e.g. "/nginx", "org/") are skipped with a log line,
//     because they can only produce invalid Docker Hub URLs. If no valid
//     entry remains, the default image list is kept.
func scraperConfigFromEnv() scraperConfig {
	cfg := scraperConfig{
		Enabled:       true,
		MaxTags:       10,
		IntervalHours: 24,
		Images:        defaultImages,
	}

	if v := strings.TrimSpace(os.Getenv("DOCKER_SCRAPER_ENABLED")); v != "" {
		if enabled, err := strconv.ParseBool(v); err == nil {
			cfg.Enabled = enabled
		}
	}
	if n, ok := positiveIntEnv("DOCKER_SCRAPER_MAX_TAGS"); ok {
		cfg.MaxTags = n
	}
	if h, ok := positiveIntEnv("DOCKER_SCRAPER_INTERVAL_H"); ok {
		cfg.IntervalHours = h
	}

	if v := os.Getenv("DOCKER_SCRAPER_IMAGES"); v != "" {
		var specs []DockerImageSpec
		for _, raw := range strings.Split(v, ",") {
			raw = strings.TrimSpace(raw)
			if raw == "" {
				continue
			}
			// Split on the first "/" only: everything after it is the
			// repository name, exactly as before.
			namespace, name, hasNamespace := strings.Cut(raw, "/")
			if !hasNamespace {
				namespace, name = "library", raw
			}
			namespace, name = strings.TrimSpace(namespace), strings.TrimSpace(name)
			if namespace == "" || name == "" {
				fmt.Printf("[docker-scraper] ignoring malformed image %q in DOCKER_SCRAPER_IMAGES\n", raw)
				continue
			}
			specs = append(specs, DockerImageSpec{Namespace: namespace, Name: name})
		}
		if len(specs) > 0 {
			cfg.Images = specs
		}
	}
	return cfg
}

// positiveIntEnv reads the environment variable key and reports its value
// when it is a strictly positive integer; ok is false otherwise.
func positiveIntEnv(key string) (value int, ok bool) {
	raw := strings.TrimSpace(os.Getenv(key))
	if raw == "" {
		return 0, false
	}
	n, err := strconv.Atoi(raw)
	if err != nil || n <= 0 {
		return 0, false
	}
	return n, true
}
// ─── Docker Hub API types ──────────────────────────────────────────────────────
// JSON shapes of the Docker Hub v2 API responses decoded by the scraper.
// Only the fields declared here are decoded; everything else in the payload
// is ignored by encoding/json.
type (
	// hubRepoInfo is the repository-level metadata document.
	hubRepoInfo struct {
		// Description is the short, one-line summary.
		Description string `json:"description"`
		// FullDescription is the long (Markdown) description.
		FullDescription string `json:"full_description"`
	}

	// hubTagImage describes one entry of a tag's "images" array.
	hubTagImage struct {
		Architecture string `json:"architecture"`
		OS           string `json:"os"`
		Digest       string `json:"digest"`
	}

	// hubTag is one element of the "results" array of a tags listing.
	hubTag struct {
		Name        string        `json:"name"`
		FullSize    int64         `json:"full_size"`
		LastUpdated string        `json:"last_updated"`
		Images      []hubTagImage `json:"images"`
	}

	// hubTagsResponse is one page of a repository's tag listing.
	hubTagsResponse struct {
		Count   int      `json:"count"`
		Results []hubTag `json:"results"`
	}
)
// reMarkdownImage matches a Markdown image whose target is an absolute
// http(s) URL, e.g. ![logo](https://example.com/logo.png) or
// ![logo](https://example.com/logo.png "Title").
//
// Capture group 1 holds the URL only: it stops at the first whitespace or
// closing parenthesis, so an optional Markdown title after the URL is never
// part of the captured value.
var reMarkdownImage = regexp.MustCompile(`!\[[^\]]*\]\(\s*(https?://[^\s)]+)(?:\s[^)]*)?\)`)

// extractLogoURL returns the URL of the first Markdown image with an
// absolute http(s) target found in markdown, or "" when there is none.
func extractLogoURL(markdown string) string {
	// A successful match always yields the full match plus one capture group.
	if m := reMarkdownImage.FindStringSubmatch(markdown); len(m) == 2 {
		return m[1]
	}
	return ""
}
// hubHTTPClient is the HTTP client used for every Docker Hub request. Unlike
// http.DefaultClient it has an overall timeout, so a stalled connection
// cannot block the scraper goroutine indefinitely.
var hubHTTPClient = &http.Client{Timeout: 30 * time.Second}

// maxHubResponseBytes caps how many bytes of a response body fetchJSON is
// willing to read into memory.
const maxHubResponseBytes = 32 << 20 // 32 MiB

// fetchJSON performs a GET request to url and decodes the JSON body into out.
//
// out must be a pointer accepted by json.Unmarshal. An error is returned when
// the request fails or times out, when the status code is not 200, when the
// body exceeds maxHubResponseBytes, or when the body is not valid JSON for
// out.
func fetchJSON(url string, out interface{}) error {
	resp, err := hubHTTPClient.Get(url) //nolint:noctx
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("HTTP %d for %s", resp.StatusCode, url)
	}

	// Read one byte past the cap so an oversized body is detected instead of
	// being silently truncated into invalid JSON.
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxHubResponseBytes+1))
	if err != nil {
		return err
	}
	if len(body) > maxHubResponseBytes {
		return fmt.Errorf("response for %s exceeds %d bytes", url, maxHubResponseBytes)
	}
	return json.Unmarshal(body, out)
}
// ─── Entry point ──────────────────────────────────────────────────────────────
// StartDockerScraper is the entry point of the Docker Hub scraper.
//
// It reads its configuration from the environment and returns immediately
// when the scraper is disabled. Otherwise it performs one scrape cycle right
// away and then one more each time the configured interval elapses; in that
// case it never returns, so callers are expected to run it in its own
// goroutine: `go infrastructure.StartDockerScraper()`.
func StartDockerScraper() {
	cfg := scraperConfigFromEnv()
	if !cfg.Enabled {
		fmt.Println("[docker-scraper] disabled (DOCKER_SCRAPER_ENABLED=false)")
		return
	}

	imageCount := len(cfg.Images)
	fmt.Printf("[docker-scraper] started — images=%d maxTags=%d interval=%dh\n",
		imageCount, cfg.MaxTags, cfg.IntervalHours)

	// First cycle runs before the ticker exists, so startup is not delayed
	// by a full interval.
	runScrape(cfg)

	period := time.Duration(cfg.IntervalHours) * time.Hour
	ticker := time.NewTicker(period)
	defer ticker.Stop()
	for {
		<-ticker.C
		runScrape(cfg)
	}
}
// runScrape performs a single scrape cycle: every image in cfg.Images is
// scraped in order with cfg.MaxTags. A failure on one image is logged and
// does not stop the remaining images from being processed.
func runScrape(cfg scraperConfig) {
	now := func() string { return time.Now().Format(time.RFC3339) }

	fmt.Printf("[docker-scraper] cycle started at %s\n", now())
	for i := range cfg.Images {
		image := cfg.Images[i]
		err := scrapeImage(image, cfg.MaxTags)
		if err == nil {
			continue
		}
		fmt.Printf("[docker-scraper] %s/%s: %v\n", image.Namespace, image.Name, err)
	}
	fmt.Printf("[docker-scraper] cycle done at %s\n", now())
}
// ─── Per-image scraping ────────────────────────────────────────────────────────
// scrapeImage synchronises one Docker Hub repository with the catalog.
//
// It downloads the repository metadata and up to maxTags tags (requested
// with ordering=last_updated). When the tag list is empty nothing is
// written. Otherwise the ProcessingResource named after the image is looked
// up: if it exists, its missing tag-instances are added; if not, a new
// resource is created from the metadata and tags.
//
// The returned error wraps the failing step (repository fetch, tag fetch,
// create or sync).
func scrapeImage(spec DockerImageSpec, maxTags int) error {
	// ── Repository metadata ──────────────────────────────────────────────────
	var repo hubRepoInfo
	repoEndpoint := fmt.Sprintf("https://hub.docker.com/v2/repositories/%s/%s/",
		spec.Namespace, spec.Name)
	if err := fetchJSON(repoEndpoint, &repo); err != nil {
		return fmt.Errorf("fetch repo info: %w", err)
	}

	// ── Tag listing ──────────────────────────────────────────────────────────
	var page hubTagsResponse
	tagsEndpoint := fmt.Sprintf(
		"https://hub.docker.com/v2/repositories/%s/%s/tags?page_size=%d&ordering=last_updated",
		spec.Namespace, spec.Name, maxTags)
	if err := fetchJSON(tagsEndpoint, &page); err != nil {
		return fmt.Errorf("fetch tags: %w", err)
	}
	tags := page.Results
	if len(tags) == 0 {
		return nil // nothing to upsert
	}

	// ── Upsert into the catalog (admin request) ──────────────────────────────
	accessor := (&resources.ProcessingResource{}).GetAccessor(&tools.APIRequest{Admin: true})
	name := spec.resourceName()
	if current := findProcessingResourceByName(accessor, name); current != nil {
		return syncDockerInstances(accessor, current, spec, tags)
	}
	return createDockerProcessingResource(accessor, spec, name, repo, tags)
}
// resourceName gives the name under which the image is stored in the
// catalog: the bare image name for official ("library") images, and
// "namespace/name" for everything else.
func (s DockerImageSpec) resourceName() string {
	switch s.Namespace {
	case "library":
		return s.Name
	default:
		return s.Namespace + "/" + s.Name
	}
}
// dockerRef assembles the pull reference "docker.io/<repository>:<tag>" for
// an image+tag pair. The repository part is the bare image name for official
// ("library") images and "namespace/name" otherwise.
func dockerRef(spec DockerImageSpec, tag string) string {
	repository := spec.Name
	if spec.Namespace != "library" {
		repository = spec.Namespace + "/" + spec.Name
	}
	return "docker.io/" + repository + ":" + tag
}
// ─── DB helpers ───────────────────────────────────────────────────────────────
// findProcessingResourceByName looks up a ProcessingResource by exact name.
//
// It runs a LIKE search on the two name paths ("abstractresource.abstractobject.name"
// and "abstractobject.name"), first with the draft flag set to false, then
// with it set to true, and returns the first result that is a
// *resources.ProcessingResource whose GetName() equals name. It returns nil
// when nothing matches.
//
// NOTE(review): the status code and error returned by accessor.Search are
// discarded, so a failed search is indistinguishable from "not found".
func findProcessingResourceByName(accessor utils.Accessor, name string) *resources.ProcessingResource {
	// Each key gets its own filter slice, as in a literal map declaration.
	likeName := func() []dbs.Filter {
		return []dbs.Filter{{
			Operator: dbs.LIKE.String(),
			Value:    name,
		}}
	}
	filters := &dbs.Filters{
		Or: map[string][]dbs.Filter{
			"abstractresource.abstractobject.name": likeName(),
			"abstractobject.name":                  likeName(),
		},
	}

	draftFlags := [...]bool{false, true}
	for i := range draftFlags {
		found, _, _ := accessor.Search(filters, "", draftFlags[i])
		for _, candidate := range found {
			pr, isProcessing := candidate.(*resources.ProcessingResource)
			if !isProcessing || pr.GetName() != name {
				continue
			}
			return pr
		}
	}
	return nil
}
// createDockerProcessingResource stores a brand-new ProcessingResource named
// name, with one peerless instance per entry of tags, then publishes it by
// setting IsDraft to false with a raw update.
//
// The resource description, short description and logo come from info; the
// logo is the first absolute image URL found in the full description.
//
// It returns an error when the store step fails, when the stored object is
// not a *resources.ProcessingResource, or when the publishing update fails.
func createDockerProcessingResource(
	accessor utils.Accessor,
	spec DockerImageSpec,
	name string,
	info hubRepoInfo,
	tags []hubTag,
) error {
	resource := &resources.ProcessingResource{
		AbstractInstanciatedResource: resources.AbstractInstanciatedResource[*resources.ProcessingInstance]{
			AbstractResource: resources.AbstractResource{
				AbstractObject: utils.AbstractObject{
					UUID: uuid.New().String(),
					Name: name,
				},
				Description:      info.FullDescription,
				ShortDescription: info.Description,
				Logo:             extractLogoURL(info.FullDescription),
				Owners: []utils.Owner{
					{Name: "https://hub.docker.com/", Logo: "https://icones8.fr/icon/Wln8Z3PcXanx/logo-docker"},
				},
			},
		},
		Infrastructure: enum.DOCKER,
		OpenSource:     true,
		IsService:      false,
	}
	for i := range tags {
		resource.AddInstances(buildPeerlessInstance(spec, tags[i]))
	}

	// StoreOne goes through GenericStoreOne which calls AbstractResource.StoreDraftDefault()
	// setting IsDraft=true. We then publish with a raw update.
	stored, _, err := accessor.StoreOne(resource)
	if err != nil {
		return fmt.Errorf("store %q: %w", name, err)
	}
	// Checked assertion: an unexpected (or nil) stored object must surface as
	// an error for this image instead of panicking the scraper goroutine.
	pr, ok := stored.(*resources.ProcessingResource)
	if !ok || pr == nil {
		return fmt.Errorf("store %q: unexpected stored object of type %T", name, stored)
	}
	pr.IsDraft = false
	if _, _, err := utils.GenericRawUpdateOne(pr, pr.GetID(), accessor); err != nil {
		return fmt.Errorf("publish %q: %w", name, err)
	}
	fmt.Printf("[docker-scraper] created %q with %d instances\n", name, len(tags))
	return nil
}
// syncDockerInstances extends an existing ProcessingResource with the tags
// it does not have yet. A tag counts as present when one of the resource's
// instances already carries the same Origin.Ref; such instances are left
// untouched so that manually enriched metadata is preserved.
//
// The resource is written back (raw update) only when at least one instance
// was added; the update error, if any, is returned wrapped.
func syncDockerInstances(
	accessor utils.Accessor,
	resource *resources.ProcessingResource,
	spec DockerImageSpec,
	tags []hubTag,
) error {
	// Set of pull references already attached to the resource.
	known := make(map[string]struct{}, len(resource.Instances))
	for _, inst := range resource.Instances {
		known[inst.GetOrigin().Ref] = struct{}{}
	}

	added := 0
	for _, tag := range tags {
		if _, present := known[dockerRef(spec, tag.Name)]; present {
			continue
		}
		resource.AddInstances(buildPeerlessInstance(spec, tag))
		added++
	}
	if added == 0 {
		return nil
	}

	_, _, err := utils.GenericRawUpdateOne(resource, resource.GetID(), accessor)
	if err != nil {
		return fmt.Errorf("sync instances for %q: %w", resource.GetName(), err)
	}
	fmt.Printf("[docker-scraper] added %d new instances to %q\n", added, resource.GetName())
	return nil
}
// ─── Instance builder ─────────────────────────────────────────────────────────
// buildPeerlessInstance turns one Docker Hub tag into a ProcessingInstance
// shaped to be "peerless":
//
//	CreatorID    = ""  (left at its zero value: no owning peer)
//	Partnerships = nil (left at its zero value: no partnerships)
//	Origin.Ref   = the docker pull reference of the image+tag
//
// The same pull reference is used as the container image. The container's
// "ARCH" environment variable is set to the architecture of the tag's first
// image entry, or to "" when the tag lists none.
//
// NOTE(review): per the original author, ProcessingInstance.StoreDraftDefault()
// enforces the peerless invariant on write — not verifiable from this file.
func buildPeerlessInstance(spec DockerImageSpec, tag hubTag) *resources.ProcessingInstance {
	pullRef := dockerRef(spec, tag.Name)

	// Architecture hint: first image manifest entry, if there is one.
	var arch string
	if manifests := tag.Images; len(manifests) != 0 {
		arch = manifests[0].Architecture
	}

	origin := resources.OriginMeta{
		Type:     resources.OriginPublic,
		Ref:      pullRef,
		License:  "",   // filled in per-image if known (e.g. MIT, Apache-2.0)
		Verified: true, // official Docker Hub images are considered verified
	}

	// Command, Args and Volumes stay empty so the image defaults apply.
	container := &models.Container{
		Image: pullRef,
		Env: map[string]string{
			"ARCH": arch,
		},
	}

	return &resources.ProcessingInstance{
		ResourceInstance: resources.ResourceInstance[*resources.ResourcePartnerShip[*resources.ProcessingResourcePricingProfile]]{
			// CreatorID is intentionally not set — required for IsPeerless().
			AbstractObject: utils.AbstractObject{
				UUID: uuid.New().String(),
				Name: tag.Name,
			},
			Origin: origin,
			// Env / Inputs / Outputs are left empty; they can be enriched
			// manually or by future scrapers.
		},
		Access: &resources.ProcessingResourceAccess{
			Container: container,
		},
	}
}