583 lines
21 KiB
Go
583 lines
21 KiB
Go
|
|
package infrastructure
|
||
|
|
|
||
|
|
// docker_scraper.go — Seeds the catalog with official Docker Hub images as
|
||
|
|
// ProcessingResources at API startup, then refreshes on a configurable interval.
|
||
|
|
//
|
||
|
|
// Each image version (tag) becomes one *peerless* ProcessingInstance:
|
||
|
|
// - CreatorID = "" (no owning peer)
|
||
|
|
// - Partnerships = nil (no partnerships)
|
||
|
|
// - Origin.Ref = "docker.io/<img>:<tag>" (non-empty registry ref)
|
||
|
|
//
|
||
|
|
// This satisfies ResourceInstance.IsPeerless() and makes every instance freely
|
||
|
|
// accessible to all peers without any pricing negotiation.
|
||
|
|
//
|
||
|
|
// Environment variables (all optional):
|
||
|
|
//
|
||
|
|
// DOCKER_SCRAPER_ENABLED true | false (default: true)
|
||
|
|
// DOCKER_SCRAPER_IMAGES comma-separated list of images to track.
|
||
|
|
// Format: "name" for official library images,
|
||
|
|
// "org/name" for user/org images.
|
||
|
|
// Default: a curated set of popular official images.
|
||
|
|
// DOCKER_SCRAPER_MAX_TAGS max tags to import per image (default: 10)
|
||
|
|
// DOCKER_SCRAPER_INTERVAL_H refresh interval in hours (default: 24)
|
||
|
|
|
||
|
|
import (
|
||
|
|
"encoding/json"
|
||
|
|
"fmt"
|
||
|
|
"io"
|
||
|
|
"net/http"
|
||
|
|
"os"
|
||
|
|
"regexp"
|
||
|
|
"strconv"
|
||
|
|
"strings"
|
||
|
|
"time"
|
||
|
|
|
||
|
|
"cloud.o-forge.io/core/oc-lib/dbs"
|
||
|
|
"cloud.o-forge.io/core/oc-lib/models/common/enum"
|
||
|
|
"cloud.o-forge.io/core/oc-lib/models/common/models"
|
||
|
|
"cloud.o-forge.io/core/oc-lib/models/resources"
|
||
|
|
"cloud.o-forge.io/core/oc-lib/models/utils"
|
||
|
|
"cloud.o-forge.io/core/oc-lib/tools"
|
||
|
|
"github.com/google/uuid"
|
||
|
|
)
|
||
|
|
|
||
|
|
// ─── Configuration ────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
// DockerImageSpec identifies one Docker Hub repository to scrape.
// Namespace is "library" for official images, or the org/user name otherwise.
type DockerImageSpec struct {
	Namespace string // Docker Hub namespace ("library" for official images)
	Name      string // repository name within the namespace
}
|
||
|
|
|
||
|
|
// scraperConfig holds all runtime parameters for the Docker Hub scraper.
// Values are populated from environment variables by scraperConfigFromEnv.
type scraperConfig struct {
	Enabled       bool              // master switch (DOCKER_SCRAPER_ENABLED)
	Images        []DockerImageSpec // repositories to track
	MaxTags       int               // max tags imported per image (DOCKER_SCRAPER_MAX_TAGS)
	IntervalHours int               // refresh period in hours (DOCKER_SCRAPER_INTERVAL_H)
}
|
||
|
|
|
||
|
|
// defaultImages is the baseline catalog seeded when DOCKER_SCRAPER_IMAGES is not set.
|
||
|
|
var defaultImages = []DockerImageSpec{
|
||
|
|
{Namespace: "library", Name: "ubuntu"},
|
||
|
|
{Namespace: "library", Name: "debian"},
|
||
|
|
{Namespace: "library", Name: "alpine"},
|
||
|
|
{Namespace: "library", Name: "python"},
|
||
|
|
{Namespace: "library", Name: "golang"},
|
||
|
|
{Namespace: "library", Name: "node"},
|
||
|
|
{Namespace: "library", Name: "nginx"},
|
||
|
|
{Namespace: "library", Name: "postgres"},
|
||
|
|
{Namespace: "library", Name: "redis"},
|
||
|
|
{Namespace: "library", Name: "mysql"},
|
||
|
|
{Namespace: "library", Name: "redmine"},
|
||
|
|
{Namespace: "library", Name: "ruby"},
|
||
|
|
{Namespace: "library", Name: "rabbitmq"},
|
||
|
|
{Namespace: "library", Name: "nextcloud"},
|
||
|
|
{Namespace: "library", Name: "php"},
|
||
|
|
{Namespace: "library", Name: "wordpress"},
|
||
|
|
{Namespace: "library", Name: "fluentd"},
|
||
|
|
{Namespace: "library", Name: "gradle"},
|
||
|
|
{Namespace: "library", Name: "mongo"},
|
||
|
|
{Namespace: "library", Name: "clickhouse"},
|
||
|
|
{Namespace: "library", Name: "mariadb"},
|
||
|
|
{Namespace: "library", Name: "eclipse-temurin"},
|
||
|
|
{Namespace: "library", Name: "sonarqube"},
|
||
|
|
{Namespace: "library", Name: "neo4j"},
|
||
|
|
{Namespace: "library", Name: "rust"},
|
||
|
|
{Namespace: "library", Name: "oraclelinux"},
|
||
|
|
{Namespace: "library", Name: "openjdk"},
|
||
|
|
{Namespace: "library", Name: "traefik"},
|
||
|
|
{Namespace: "library", Name: "ghost"},
|
||
|
|
{Namespace: "library", Name: "docker"},
|
||
|
|
{Namespace: "library", Name: "websphere-liberty"},
|
||
|
|
{Namespace: "library", Name: "open-liberty"},
|
||
|
|
{Namespace: "library", Name: "storm"},
|
||
|
|
{Namespace: "library", Name: "swift"},
|
||
|
|
{Namespace: "library", Name: "rocket.chat"},
|
||
|
|
{Namespace: "library", Name: "odoo"},
|
||
|
|
{Namespace: "library", Name: "busybox"},
|
||
|
|
{Namespace: "library", Name: "nats"},
|
||
|
|
{Namespace: "library", Name: "mageia"},
|
||
|
|
{Namespace: "library", Name: "tomcat"},
|
||
|
|
{Namespace: "library", Name: "perl"},
|
||
|
|
{Namespace: "library", Name: "xwiki"},
|
||
|
|
{Namespace: "library", Name: "cassandra"},
|
||
|
|
{Namespace: "library", Name: "varnish"},
|
||
|
|
{Namespace: "library", Name: "ibm-semeru-runtimes"},
|
||
|
|
{Namespace: "library", Name: "archlinux"},
|
||
|
|
{Namespace: "library", Name: "clojure"},
|
||
|
|
{Namespace: "library", Name: "maven"},
|
||
|
|
{Namespace: "library", Name: "buildpack-deps"},
|
||
|
|
{Namespace: "library", Name: "solr"},
|
||
|
|
{Namespace: "library", Name: "groovy"},
|
||
|
|
{Namespace: "library", Name: "phpmyadmin"},
|
||
|
|
{Namespace: "library", Name: "hylang"},
|
||
|
|
{Namespace: "library", Name: "joomla"},
|
||
|
|
{Namespace: "library", Name: "matomo"},
|
||
|
|
{Namespace: "library", Name: "drupal"},
|
||
|
|
{Namespace: "library", Name: "yourls"},
|
||
|
|
{Namespace: "library", Name: "haproxy"},
|
||
|
|
{Namespace: "library", Name: "elixir"},
|
||
|
|
{Namespace: "library", Name: "geonetwork"},
|
||
|
|
{Namespace: "library", Name: "convertigo"},
|
||
|
|
{Namespace: "library", Name: "erlang"},
|
||
|
|
{Namespace: "library", Name: "azul-zulu"},
|
||
|
|
{Namespace: "library", Name: "kibana"},
|
||
|
|
{Namespace: "library", Name: "percona"},
|
||
|
|
{Namespace: "library", Name: "logstash"},
|
||
|
|
{Namespace: "library", Name: "elasticsearch"},
|
||
|
|
{Namespace: "library", Name: "krakend"},
|
||
|
|
{Namespace: "library", Name: "postfixadmin"},
|
||
|
|
{Namespace: "library", Name: "monica"},
|
||
|
|
{Namespace: "library", Name: "friendica"},
|
||
|
|
{Namespace: "library", Name: "sapmachine"},
|
||
|
|
{Namespace: "library", Name: "dart"},
|
||
|
|
{Namespace: "library", Name: "spiped"},
|
||
|
|
{Namespace: "library", Name: "amazoncorreto"},
|
||
|
|
{Namespace: "library", Name: "zookeeper"},
|
||
|
|
{Namespace: "library", Name: "julia"},
|
||
|
|
{Namespace: "library", Name: "gcc"},
|
||
|
|
{Namespace: "library", Name: "ibmjava"},
|
||
|
|
{Namespace: "library", Name: "mediawiki"},
|
||
|
|
{Namespace: "library", Name: "couchbase"},
|
||
|
|
{Namespace: "library", Name: "jetty"},
|
||
|
|
{Namespace: "library", Name: "sparl"},
|
||
|
|
{Namespace: "library", Name: "tomee"},
|
||
|
|
{Namespace: "library", Name: "kapacitor"},
|
||
|
|
{Namespace: "library", Name: "ros"},
|
||
|
|
{Namespace: "library", Name: "silverpeas"},
|
||
|
|
{Namespace: "library", Name: "jruby"},
|
||
|
|
{Namespace: "library", Name: "neurodebian"},
|
||
|
|
{Namespace: "library", Name: "flink"},
|
||
|
|
{Namespace: "library", Name: "pypy"},
|
||
|
|
{Namespace: "library", Name: "orientdb"},
|
||
|
|
{Namespace: "library", Name: "liquidbase"},
|
||
|
|
{Namespace: "library", Name: "haxe"},
|
||
|
|
{Namespace: "library", Name: "r-base"},
|
||
|
|
{Namespace: "library", Name: "lighstreamer"},
|
||
|
|
{Namespace: "library", Name: "kong"},
|
||
|
|
{Namespace: "library", Name: "aerospike"},
|
||
|
|
{Namespace: "library", Name: "influxdb"},
|
||
|
|
{Namespace: "library", Name: "irssi"},
|
||
|
|
{Namespace: "library", Name: "rakudo-star"},
|
||
|
|
{Namespace: "library", Name: "satosa"},
|
||
|
|
{Namespace: "library", Name: "rethinkdb"},
|
||
|
|
{Namespace: "library", Name: "chronograf"},
|
||
|
|
{Namespace: "library", Name: "memcached"},
|
||
|
|
{Namespace: "library", Name: "backdrop"},
|
||
|
|
{Namespace: "library", Name: "telegraf"},
|
||
|
|
{Namespace: "library", Name: "httpd"},
|
||
|
|
{Namespace: "library", Name: "haskell"},
|
||
|
|
{Namespace: "library", Name: "emqx"},
|
||
|
|
{Namespace: "library", Name: "swipl"},
|
||
|
|
{Namespace: "library", Name: "couchdb"},
|
||
|
|
{Namespace: "library", Name: "hitch"},
|
||
|
|
{Namespace: "library", Name: "composer"},
|
||
|
|
{Namespace: "library", Name: "adminer"},
|
||
|
|
{Namespace: "library", Name: "amazonlinux"},
|
||
|
|
{Namespace: "library", Name: "bash"},
|
||
|
|
{Namespace: "library", Name: "caddy"},
|
||
|
|
{Namespace: "library", Name: "arangodb"},
|
||
|
|
{Namespace: "library", Name: "bonita"},
|
||
|
|
{Namespace: "library", Name: "photon"},
|
||
|
|
{Namespace: "library", Name: "almalinux"},
|
||
|
|
{Namespace: "library", Name: "teamspeak"},
|
||
|
|
{Namespace: "library", Name: "fedora"},
|
||
|
|
{Namespace: "library", Name: "eclipse-mosquitto"},
|
||
|
|
{Namespace: "library", Name: "registry"},
|
||
|
|
{Namespace: "library", Name: "eggdrop"},
|
||
|
|
{Namespace: "library", Name: "znc"},
|
||
|
|
{Namespace: "library", Name: "api-firewall"},
|
||
|
|
{Namespace: "library", Name: "alt"},
|
||
|
|
{Namespace: "library", Name: "unit"},
|
||
|
|
{Namespace: "library", Name: "clearlinux"},
|
||
|
|
{Namespace: "library", Name: "gazebo"},
|
||
|
|
{Namespace: "library", Name: "mongo-express"},
|
||
|
|
{Namespace: "library", Name: "plone"},
|
||
|
|
{Namespace: "library", Name: "cirros"},
|
||
|
|
{Namespace: "library", Name: "mono"},
|
||
|
|
{Namespace: "library", Name: "nats-streaming"},
|
||
|
|
{Namespace: "library", Name: "sl"},
|
||
|
|
{Namespace: "library", Name: "rockylinux"},
|
||
|
|
{Namespace: "library", Name: "notary"},
|
||
|
|
{Namespace: "library", Name: "vault"},
|
||
|
|
{Namespace: "library", Name: "jobber"},
|
||
|
|
{Namespace: "library", Name: "consul"},
|
||
|
|
{Namespace: "library", Name: "php-zendserver"},
|
||
|
|
{Namespace: "library", Name: "centos"},
|
||
|
|
{Namespace: "library", Name: "express-gateway"},
|
||
|
|
{Namespace: "library", Name: "clefos"},
|
||
|
|
{Namespace: "library", Name: "adoptopenjdk"},
|
||
|
|
{Namespace: "library", Name: "thrift"},
|
||
|
|
{Namespace: "library", Name: "rapidoid"},
|
||
|
|
{Namespace: "library", Name: "kaazing-gateway"},
|
||
|
|
{Namespace: "library", Name: "nuxeo"},
|
||
|
|
{Namespace: "library", Name: "neo4j"},
|
||
|
|
{Namespace: "library", Name: "fsharp"},
|
||
|
|
{Namespace: "library", Name: "sourcemage"},
|
||
|
|
{Namespace: "library", Name: "swarm"},
|
||
|
|
{Namespace: "library", Name: "euleros"},
|
||
|
|
{Namespace: "library", Name: "crux"},
|
||
|
|
{Namespace: "library", Name: "sentry"},
|
||
|
|
{Namespace: "library", Name: "known"},
|
||
|
|
{Namespace: "library", Name: "opensuse"},
|
||
|
|
{Namespace: "library", Name: "owncloud"},
|
||
|
|
{Namespace: "library", Name: "piwik"},
|
||
|
|
{Namespace: "library", Name: "jenkins"},
|
||
|
|
{Namespace: "library", Name: "celery"},
|
||
|
|
{Namespace: "library", Name: "iojs"},
|
||
|
|
{Namespace: "library", Name: "java"},
|
||
|
|
{Namespace: "library", Name: "rails"},
|
||
|
|
{Namespace: "library", Name: "django"},
|
||
|
|
{Namespace: "library", Name: "glassfish"},
|
||
|
|
{Namespace: "library", Name: "hipache"},
|
||
|
|
{Namespace: "library", Name: "ubuntu-upstart"},
|
||
|
|
{Namespace: "library", Name: "ubuntu-debootstrap"},
|
||
|
|
{Namespace: "library", Name: "docker-dev"},
|
||
|
|
{Namespace: "library", Name: "scratch"},
|
||
|
|
}
|
||
|
|
|
||
|
|
// scraperConfigFromEnv reads scraper configuration from environment variables
|
||
|
|
// and returns a populated scraperConfig with sensible defaults.
|
||
|
|
func scraperConfigFromEnv() scraperConfig {
|
||
|
|
cfg := scraperConfig{
|
||
|
|
Enabled: true,
|
||
|
|
MaxTags: 10,
|
||
|
|
IntervalHours: 24,
|
||
|
|
Images: defaultImages,
|
||
|
|
}
|
||
|
|
|
||
|
|
if v := os.Getenv("DOCKER_SCRAPER_ENABLED"); v == "false" {
|
||
|
|
cfg.Enabled = false
|
||
|
|
}
|
||
|
|
if v := os.Getenv("DOCKER_SCRAPER_MAX_TAGS"); v != "" {
|
||
|
|
if n, err := strconv.Atoi(v); err == nil && n > 0 {
|
||
|
|
cfg.MaxTags = n
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if v := os.Getenv("DOCKER_SCRAPER_INTERVAL_H"); v != "" {
|
||
|
|
if h, err := strconv.Atoi(v); err == nil && h > 0 {
|
||
|
|
cfg.IntervalHours = h
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if v := os.Getenv("DOCKER_SCRAPER_IMAGES"); v != "" {
|
||
|
|
var specs []DockerImageSpec
|
||
|
|
for _, raw := range strings.Split(v, ",") {
|
||
|
|
raw = strings.TrimSpace(raw)
|
||
|
|
if raw == "" {
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
if parts := strings.SplitN(raw, "/", 2); len(parts) == 2 {
|
||
|
|
specs = append(specs, DockerImageSpec{Namespace: parts[0], Name: parts[1]})
|
||
|
|
} else {
|
||
|
|
specs = append(specs, DockerImageSpec{Namespace: "library", Name: raw})
|
||
|
|
}
|
||
|
|
}
|
||
|
|
if len(specs) > 0 {
|
||
|
|
cfg.Images = specs
|
||
|
|
}
|
||
|
|
}
|
||
|
|
return cfg
|
||
|
|
}
|
||
|
|
|
||
|
|
// ─── Docker Hub API types ──────────────────────────────────────────────────────

// hubRepoInfo is the subset of the Docker Hub v2 repository payload consumed
// by the scraper; both descriptions feed the catalog entry, and the full
// (Markdown) description is also mined for a logo URL.
type hubRepoInfo struct {
	Description     string `json:"description"`
	FullDescription string `json:"full_description"`
}

// hubTagImage describes one platform-specific manifest of a tag.
type hubTagImage struct {
	Architecture string `json:"architecture"`
	OS           string `json:"os"`
	Digest       string `json:"digest"`
}

// hubTag is one tag entry from the Docker Hub v2 tags listing.
type hubTag struct {
	Name        string        `json:"name"`
	FullSize    int64         `json:"full_size"`
	LastUpdated string        `json:"last_updated"`
	Images      []hubTagImage `json:"images"`
}

// hubTagsResponse is the paginated envelope returned by the tags endpoint.
// Only the first page is consumed; Count reports the total number of tags.
type hubTagsResponse struct {
	Count   int      `json:"count"`
	Results []hubTag `json:"results"`
}
|
||
|
|
|
||
|
|
// reMarkdownImage matches the first inline Markdown image in a string, e.g.
// "![alt text](https://example.com/logo.png)", capturing the http(s) URL.
var reMarkdownImage = regexp.MustCompile(`!\[[^\]]*\]\((https?://[^)]+)\)`)

// extractLogoURL returns the first image URL found in a Markdown string, or ""
// when the text contains no http(s) Markdown image.
func extractLogoURL(markdown string) string {
	match := reMarkdownImage.FindStringSubmatch(markdown)
	if match == nil {
		return ""
	}
	return match[1]
}
|
||
|
|
|
||
|
|
// scraperClient is the HTTP client shared by all Docker Hub requests. The
// explicit timeout prevents a stalled Hub endpoint from blocking a scrape
// cycle indefinitely (http.Get uses the default client, which has none).
var scraperClient = &http.Client{Timeout: 30 * time.Second}

// fetchJSON performs a GET request to url and decodes the JSON body into out.
// Non-200 responses are reported as errors; the body is fully read so the
// underlying connection can be reused by the transport.
func fetchJSON(url string, out interface{}) error {
	resp, err := scraperClient.Get(url) //nolint:noctx
	if err != nil {
		return fmt.Errorf("GET %s: %w", url, err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("HTTP %d for %s", resp.StatusCode, url)
	}
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return fmt.Errorf("read body of %s: %w", url, err)
	}
	return json.Unmarshal(body, out)
}
|
||
|
|
|
||
|
|
// ─── Entry point ──────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
// StartDockerScraper starts the background Docker Hub scraper goroutine.
|
||
|
|
// It runs a full scrape immediately at startup, then repeats on the configured
|
||
|
|
// interval. This function blocks forever and is designed to be called via
|
||
|
|
// `go infrastructure.StartDockerScraper()` from main().
|
||
|
|
func StartDockerScraper() {
|
||
|
|
cfg := scraperConfigFromEnv()
|
||
|
|
if !cfg.Enabled {
|
||
|
|
fmt.Println("[docker-scraper] disabled (DOCKER_SCRAPER_ENABLED=false)")
|
||
|
|
return
|
||
|
|
}
|
||
|
|
fmt.Printf("[docker-scraper] started — images=%d maxTags=%d interval=%dh\n",
|
||
|
|
len(cfg.Images), cfg.MaxTags, cfg.IntervalHours)
|
||
|
|
|
||
|
|
runScrape(cfg)
|
||
|
|
|
||
|
|
ticker := time.NewTicker(time.Duration(cfg.IntervalHours) * time.Hour)
|
||
|
|
defer ticker.Stop()
|
||
|
|
for range ticker.C {
|
||
|
|
runScrape(cfg)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// runScrape executes one full scrape cycle for all configured images.
|
||
|
|
func runScrape(cfg scraperConfig) {
|
||
|
|
fmt.Printf("[docker-scraper] cycle started at %s\n", time.Now().Format(time.RFC3339))
|
||
|
|
for _, spec := range cfg.Images {
|
||
|
|
if err := scrapeImage(spec, cfg.MaxTags); err != nil {
|
||
|
|
fmt.Printf("[docker-scraper] %s/%s: %v\n", spec.Namespace, spec.Name, err)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
fmt.Printf("[docker-scraper] cycle done at %s\n", time.Now().Format(time.RFC3339))
|
||
|
|
}
|
||
|
|
|
||
|
|
// ─── Per-image scraping ────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
// scrapeImage fetches Docker Hub metadata for one repository, then either creates
|
||
|
|
// a new ProcessingResource in the catalog or extends the existing one with any
|
||
|
|
// missing tag-instances.
|
||
|
|
func scrapeImage(spec DockerImageSpec, maxTags int) error {
|
||
|
|
// ── Fetch image metadata ──────────────────────────────────────────────────
|
||
|
|
var info hubRepoInfo
|
||
|
|
repoURL := fmt.Sprintf("https://hub.docker.com/v2/repositories/%s/%s/",
|
||
|
|
spec.Namespace, spec.Name)
|
||
|
|
if err := fetchJSON(repoURL, &info); err != nil {
|
||
|
|
return fmt.Errorf("fetch repo info: %w", err)
|
||
|
|
}
|
||
|
|
|
||
|
|
// ── Fetch tags ────────────────────────────────────────────────────────────
|
||
|
|
tagsURL := fmt.Sprintf(
|
||
|
|
"https://hub.docker.com/v2/repositories/%s/%s/tags?page_size=%d&ordering=last_updated",
|
||
|
|
spec.Namespace, spec.Name, maxTags)
|
||
|
|
var tagsResp hubTagsResponse
|
||
|
|
if err := fetchJSON(tagsURL, &tagsResp); err != nil {
|
||
|
|
return fmt.Errorf("fetch tags: %w", err)
|
||
|
|
}
|
||
|
|
if len(tagsResp.Results) == 0 {
|
||
|
|
return nil // nothing to upsert
|
||
|
|
}
|
||
|
|
|
||
|
|
adminReq := &tools.APIRequest{Admin: true}
|
||
|
|
accessor := (&resources.ProcessingResource{}).GetAccessor(adminReq)
|
||
|
|
|
||
|
|
resourceName := spec.resourceName()
|
||
|
|
existing := findProcessingResourceByName(accessor, resourceName)
|
||
|
|
|
||
|
|
if existing == nil {
|
||
|
|
return createDockerProcessingResource(accessor, spec, resourceName, info, tagsResp.Results)
|
||
|
|
}
|
||
|
|
return syncDockerInstances(accessor, existing, spec, tagsResp.Results)
|
||
|
|
}
|
||
|
|
|
||
|
|
// resourceName returns the canonical catalog name for a DockerImageSpec.
|
||
|
|
// Official (library) images use just the image name; others use "org/image".
|
||
|
|
func (s DockerImageSpec) resourceName() string {
|
||
|
|
if s.Namespace == "library" {
|
||
|
|
return s.Name
|
||
|
|
}
|
||
|
|
return s.Namespace + "/" + s.Name
|
||
|
|
}
|
||
|
|
|
||
|
|
// dockerRef builds the canonical pull reference for an image+tag pair.
|
||
|
|
func dockerRef(spec DockerImageSpec, tag string) string {
|
||
|
|
if spec.Namespace == "library" {
|
||
|
|
return "docker.io/" + spec.Name + ":" + tag
|
||
|
|
}
|
||
|
|
return "docker.io/" + spec.Namespace + "/" + spec.Name + ":" + tag
|
||
|
|
}
|
||
|
|
|
||
|
|
// ─── DB helpers ───────────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
// findProcessingResourceByName loads all ProcessingResources (both draft and
// published) and returns the first whose name matches exactly, or nil when
// no match exists.
func findProcessingResourceByName(accessor utils.Accessor, name string) *resources.ProcessingResource {
	// Match on either field path: depending on how the document was stored,
	// the name may be indexed under the embedded abstractresource or directly
	// under abstractobject — the Or filter covers both.
	filters := &dbs.Filters{
		Or: map[string][]dbs.Filter{
			"abstractresource.abstractobject.name": {{
				Operator: dbs.LIKE.String(),
				Value:    name,
			}},
			"abstractobject.name": {{
				Operator: dbs.LIKE.String(),
				Value:    name,
			}},
		},
	}
	// Search published resources first, then drafts. Search errors are
	// deliberately discarded: a failed lookup simply falls through to nil,
	// which the caller treats as "not found" (and then creates the resource).
	// NOTE(review): presumably acceptable for a best-effort scraper — confirm
	// that a transient DB error causing a duplicate-create attempt is benign.
	for _, draft := range []bool{false, true} {
		results, _, _ := accessor.Search(filters, "", draft)
		for _, r := range results {
			// LIKE may over-match (substring/pattern semantics), so re-check
			// for an exact name before returning.
			if pr, ok := r.(*resources.ProcessingResource); ok && pr.GetName() == name {
				return pr
			}
		}
	}
	return nil
}
|
||
|
|
|
||
|
|
// createDockerProcessingResource stores a brand-new ProcessingResource with one
// peerless instance per Docker Hub tag, then publishes it (IsDraft = false).
//
// Parameters:
//   - accessor: admin-scoped accessor used for both the store and the publish.
//   - spec:     repository identity (namespace + name) for building pull refs.
//   - name:     canonical catalog name (spec.resourceName()).
//   - info:     repository metadata; descriptions and logo come from here.
//   - tags:     Docker Hub tags; each becomes one peerless instance.
//
// Returns a wrapped error if either the initial store or the publish fails.
func createDockerProcessingResource(
	accessor utils.Accessor,
	spec DockerImageSpec,
	name string,
	info hubRepoInfo,
	tags []hubTag,
) error {
	resource := &resources.ProcessingResource{
		AbstractInstanciatedResource: resources.AbstractInstanciatedResource[*resources.ProcessingInstance]{
			AbstractResource: resources.AbstractResource{
				AbstractObject: utils.AbstractObject{
					UUID: uuid.New().String(),
					Name: name,
				},
				// Full Markdown description doubles as the logo source.
				Description:      info.FullDescription,
				ShortDescription: info.Description,
				Logo:             extractLogoURL(info.FullDescription),
				Owners: []utils.Owner{
					{Name: "https://hub.docker.com/", Logo: "https://icones8.fr/icon/Wln8Z3PcXanx/logo-docker"},
				},
			},
		},
		Infrastructure: enum.DOCKER,
		OpenSource:     true,
		IsService:      false,
	}

	// One peerless instance per tag (see buildPeerlessInstance).
	for i := range tags {
		resource.AddInstances(buildPeerlessInstance(spec, tags[i]))
	}

	// StoreOne goes through GenericStoreOne which calls AbstractResource.StoreDraftDefault()
	// setting IsDraft=true. We then publish with a raw update.
	stored, _, err := accessor.StoreOne(resource)
	if err != nil {
		return fmt.Errorf("store %q: %w", name, err)
	}
	// A raw update bypasses the draft-defaulting logic, so the flip sticks.
	pr := stored.(*resources.ProcessingResource)
	pr.IsDraft = false
	if _, _, err := utils.GenericRawUpdateOne(pr, pr.GetID(), accessor); err != nil {
		return fmt.Errorf("publish %q: %w", name, err)
	}
	fmt.Printf("[docker-scraper] created %q with %d instances\n", name, len(tags))
	return nil
}
|
||
|
|
|
||
|
|
// syncDockerInstances adds to an existing ProcessingResource any tag-instances
|
||
|
|
// that are not yet present (identified by Origin.Ref). Already-present tags
|
||
|
|
// are left untouched to preserve any manually enriched metadata.
|
||
|
|
func syncDockerInstances(
|
||
|
|
accessor utils.Accessor,
|
||
|
|
resource *resources.ProcessingResource,
|
||
|
|
spec DockerImageSpec,
|
||
|
|
tags []hubTag,
|
||
|
|
) error {
|
||
|
|
existing := map[string]bool{}
|
||
|
|
for _, inst := range resource.Instances {
|
||
|
|
existing[inst.GetOrigin().Ref] = true
|
||
|
|
}
|
||
|
|
|
||
|
|
added := 0
|
||
|
|
for i := range tags {
|
||
|
|
ref := dockerRef(spec, tags[i].Name)
|
||
|
|
if existing[ref] {
|
||
|
|
continue
|
||
|
|
}
|
||
|
|
resource.AddInstances(buildPeerlessInstance(spec, tags[i]))
|
||
|
|
added++
|
||
|
|
}
|
||
|
|
if added == 0 {
|
||
|
|
return nil
|
||
|
|
}
|
||
|
|
|
||
|
|
if _, _, err := utils.GenericRawUpdateOne(resource, resource.GetID(), accessor); err != nil {
|
||
|
|
return fmt.Errorf("sync instances for %q: %w", resource.GetName(), err)
|
||
|
|
}
|
||
|
|
fmt.Printf("[docker-scraper] added %d new instances to %q\n", added, resource.GetName())
|
||
|
|
return nil
|
||
|
|
}
|
||
|
|
|
||
|
|
// ─── Instance builder ─────────────────────────────────────────────────────────
|
||
|
|
|
||
|
|
// buildPeerlessInstance creates a ProcessingInstance that satisfies
|
||
|
|
// ResourceInstance.IsPeerless():
|
||
|
|
//
|
||
|
|
// CreatorID = "" (zero value — no owning peer)
|
||
|
|
// Partnerships = nil (zero value — no partnerships)
|
||
|
|
// Origin.Ref != "" (set to the canonical docker pull reference)
|
||
|
|
//
|
||
|
|
// ProcessingInstance.StoreDraftDefault() enforces this invariant on write.
|
||
|
|
func buildPeerlessInstance(spec DockerImageSpec, tag hubTag) *resources.ProcessingInstance {
|
||
|
|
ref := dockerRef(spec, tag.Name)
|
||
|
|
|
||
|
|
// Collect architecture hint from the first image manifest entry (if any).
|
||
|
|
arch := ""
|
||
|
|
if len(tag.Images) > 0 {
|
||
|
|
arch = tag.Images[0].Architecture
|
||
|
|
}
|
||
|
|
|
||
|
|
return &resources.ProcessingInstance{
|
||
|
|
ResourceInstance: resources.ResourceInstance[*resources.ResourcePartnerShip[*resources.ProcessingResourcePricingProfile]]{
|
||
|
|
AbstractObject: utils.AbstractObject{
|
||
|
|
UUID: uuid.New().String(),
|
||
|
|
Name: tag.Name,
|
||
|
|
// CreatorID intentionally left empty — required for IsPeerless()
|
||
|
|
},
|
||
|
|
Origin: resources.OriginMeta{
|
||
|
|
Type: resources.OriginPublic,
|
||
|
|
Ref: ref,
|
||
|
|
License: "", // filled in per-image if known (e.g. MIT, Apache-2.0)
|
||
|
|
Verified: true, // official Docker Hub images are considered verified
|
||
|
|
},
|
||
|
|
// Env / Inputs / Outputs left empty — can be enriched manually or by
|
||
|
|
// future scrapers that read image labels / Docker Hub documentation.
|
||
|
|
},
|
||
|
|
Access: &resources.ProcessingResourceAccess{
|
||
|
|
Container: &models.Container{
|
||
|
|
Image: ref,
|
||
|
|
// Command, Args, Env, Volumes left empty — image defaults apply.
|
||
|
|
Env: map[string]string{
|
||
|
|
"ARCH": arch,
|
||
|
|
},
|
||
|
|
},
|
||
|
|
},
|
||
|
|
}
|
||
|
|
}
|