Source code for provenance.repos

import copy
import json
import multiprocessing
import operator as ops
import os
import platform
from collections import namedtuple
from contextlib import contextmanager
from datetime import datetime

import numpy as np
import psutil
import sqlalchemy
from sqlalchemy.schema import CreateSchema
import sqlalchemy.sql as sa
import sqlalchemy.dialects.postgresql as pg
import sqlalchemy.orm
import toolz as t
import wrapt
from alembic import command
from alembic.config import Config as AlembicConfig
from alembic import context as alembic_context
from alembic.migration import MigrationContext

import sqlalchemy_utils.functions as sql_utils
from memoized_property import memoized_property

from . import _commonstore as cs
from ._commonstore import find_first
from . import models as db
from . import models
from . import serializers as s
from . import utils
from .compatibility import string_type
from .hashing import hash, value_repr


def _host_info():
    return {'machine': platform.machine(),
            'nodename': platform.node(),
            'platform': platform.platform(),
            'processor': platform.processor(),
            'cpu_count': multiprocessing.cpu_count(),
            'release': platform.release(),
            'system': platform.system(),
            'version': platform.version()}


def _process_info():
    pid = os.getpid()
    p = psutil.Process(pid)
    return {'cmdline': p.cmdline(),
            'cwd': p.cwd(),
            'exe': p.exe(),
            'name': p.name(),
            'num_fds': p.num_fds(),
            'num_threads': p.num_threads()}


def _alembic_config(connection):
    root = t.pipe(__file__, os.path.realpath, os.path.dirname)
    config = AlembicConfig(os.path.join(root, 'alembic.ini'))
    config.set_main_option('script_location',
                           os.path.join(root, 'migrations'))
    config.attributes['connection'] = connection
    return config


def _create_db_if_needed(db_conn_str):
    if not sql_utils.database_exists(db_conn_str):
        sql_utils.create_database(db_conn_str)


class Config(object):

    _current = None

    @classmethod
    def current(cls):
        return cls._current

    @classmethod
    def set_current(cls, registry):
        cls._current = registry

    def __init__(self, blobstores, repos, default_repo, run_info_fn=None, use_cache=True, check_mutations=False):
        self.blobstores = blobstores
        self.repos = repos
        self.set_default_repo(default_repo)
        self._run_info = None
        self.run_info_fn = run_info_fn or t.identity
        self.use_cache = use_cache
        self.check_mutations = check_mutations

    def set_default_repo(self, repo):
        if isinstance(repo, string_type):
            if repo not in self.repos:
                raise Exception("There is no registered repo named '{}'.".format(repo))
            self.default_repo = self.repos[repo]
        else:
            self.default_repo = repo

    def set_run_info_fn(self, fn):
        self.run_info_fn = fn or t.identity
        self._run_info = None

    def run_info(self):
        if self._run_info is None:
            run_info = self.run_info_fn(
                {'host': _host_info(), 'process': _process_info(),
                 'created_at': datetime.utcnow()})
            run_info['id'] = hash(run_info)
            self._run_info = run_info
        return self._run_info


Config.set_current(Config({}, {}, None))
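

# Illustrative sketch (not part of the original module): one way a Config could
# be assembled and installed as the current configuration. The 'local' repo
# name and the empty blobstore registry are made-up placeholders.
def _example_configure():
    repos = {'local': MemoryRepo()}   # hypothetical repo registry
    Config.set_current(Config(blobstores={}, repos=repos, default_repo='local'))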


def current_config():
    return Config.current()


def set_default_repo(repo_or_name):
    current_config().set_default_repo(repo_or_name)


def get_default_repo():
    return current_config().default_repo

def set_run_info_fn(fn):
    """
    This hook allows you to provide a function that will be called once with a
    process's ``run_info`` default dictionary. The provided function can then
    update this dictionary with other useful information you wish to track,
    such as git ref or build server id.
    """
    current_config().set_run_info_fn(fn)
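
# Illustrative sketch (not part of the original module): a run_info hook that
# attaches a build identifier to the defaults collected by _host_info and
# _process_info. The 'BUILD_ID' variable and 'build_id' key are hypothetical.
def _example_run_info_hook():
    def add_build_id(run_info):
        run_info['build_id'] = os.environ.get('BUILD_ID')  # hypothetical key
        return run_info
    set_run_info_fn(add_build_id)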

def get_use_cache():
    return current_config().use_cache


def set_use_cache(setting):
    current_config().use_cache = setting


def get_check_mutations():
    return current_config().check_mutations


def set_check_mutations(setting):
    current_config().check_mutations = setting


def get_repo_by_name(repo_name):
    return current_config().repos[repo_name]

@contextmanager
def using_repo(repo_or_name):
    prev_repo = get_default_repo()
    set_default_repo(repo_or_name)
    try:
        yield
    finally:
        set_default_repo(prev_repo)
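
# Illustrative sketch (not part of the original module): using_repo temporarily
# swaps the default repo for the duration of the block. 'backup' is a
# hypothetical repo name assumed to be registered in the current Config.
def _example_using_repo(artifact_id):
    with using_repo('backup'):
        return load_artifact(artifact_id)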

def load_artifact(artifact_id):
    """
    Loads and returns the ``Artifact`` with the ``artifact_id`` from the
    default repo.

    Parameters
    ----------
    artifact_id : string

    See Also
    --------
    load_proxy
    """
    return get_default_repo().get_by_id(artifact_id)


def load_proxy(artifact_id):
    """
    Loads and returns the ``ArtifactProxy`` with the ``artifact_id`` from the
    default repo.

    Parameters
    ----------
    artifact_id : string

    See Also
    --------
    load_artifact
    """
    return get_default_repo().get_by_id(artifact_id).proxy()
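
# Illustrative sketch (not part of the original module): the difference between
# the two loaders. load_artifact returns the Artifact record itself, while
# load_proxy returns the value wrapped so the provenance stays attached.
def _example_load(artifact_id):
    artifact = load_artifact(artifact_id)   # metadata, value loaded lazily
    proxied = load_proxy(artifact_id)       # behaves like the value itself
    assert proxied.artifact.id == artifact.id
    return proxied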

def load_set_by_id(set_id):
    """
    Loads and returns the ``ArtifactSet`` with the ``set_id`` from the default
    repo.

    Parameters
    ----------
    set_id : string

    See Also
    --------
    load_set_by_name
    """
    return get_default_repo().get_set_by_id(set_id)


def load_set_by_labels(labels):
    """
    Loads and returns the ``ArtifactSet`` with the ``labels`` from the default
    repo.

    Parameters
    ----------
    labels : string or dictionary

    See Also
    --------
    load_set_by_id
    load_set_by_name
    """
    return get_default_repo().get_set_by_labels(labels)


def load_set_by_name(set_name):
    """
    Loads and returns the ``ArtifactSet`` with the ``set_name`` from the
    default repo.

    Parameters
    ----------
    set_name : string

    See Also
    --------
    load_set_by_id
    load_set_by_labels
    """
    return get_default_repo().get_set_by_labels({'name': set_name})

def _check_labels_name(labels):
    if isinstance(labels, str):
        return {'name': labels}
    return labels


def create_set(artifact_ids, labels=None):
    labels = _check_labels_name(labels)
    return ArtifactSet(artifact_ids, labels).put()
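
# Illustrative sketch (not part of the original module): creating a named set
# of artifacts. A plain string label is treated as {'name': ...} by
# _check_labels_name; 'baseline-run' is a hypothetical name.
def _example_create_set(artifact_ids):
    return create_set(artifact_ids, labels='baseline-run')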

def label_set(artifact_set_or_id, labels):
    repo = get_default_repo()
    if isinstance(artifact_set_or_id, ArtifactSet):
        artifact_set = artifact_set_or_id
    else:
        artifact_set = repo.get_set_by_id(artifact_set_or_id)
    return artifact_set.relabel(labels).put(repo)


def transform_value(proxy_artifact, transformer_fn):
    """
    Transforms the underlying value of the ``proxy_artifact`` with the provided
    ``transformer_fn``. A new ``ArtifactProxy`` is returned with the
    transformed value but with the original artifact.

    The motivation behind this function is to allow archived files to be loaded
    into memory and passed around while preserving the provenance of the
    artifact. It could be used in any other situation where you want a
    different representation of an artifact value while allowing the provenance
    to be tracked.

    Care should be taken when using this function however because it will
    prevent you from reproducing exact artifacts from the lineage since this
    transformer_fn will not be tracked.
    """
    transformed = copy.copy(proxy_artifact)
    transformed.__wrapped__ = transformer_fn(transformed)
    return transformed


class Proxy(object):

    def value_repr(self):
        return value_repr(self.artifact.value)

    def transform_value(self, transformer_fn):
        return transform_value(self, transformer_fn)

    def __next__(self):
        # Note that every proxy is always identified as an iterable anyway:
        # https://github.com/GrahamDumpleton/wrapt/issues/93
        # This just forwards the method to the wrapped value since it wasn't
        # doing that for some reason.
        return next(self.__wrapped__)


class ArtifactProxy(wrapt.ObjectProxy, Proxy):

    def __init__(self, value, artifact):
        super(ArtifactProxy, self).__init__(value)
        self._self_artifact = artifact

    @property
    def artifact(self):
        return self._self_artifact

    def __repr__(self):
        return '<provenance.ArtifactProxy({}) {} >'.format(
            self.artifact.id, repr(self.__wrapped__))

    def __reduce__(self):
        return (load_proxy, (self.artifact.id,))


class CallableArtifactProxy(wrapt.CallableObjectProxy, Proxy):

    def __init__(self, value, artifact):
        super(CallableArtifactProxy, self).__init__(value)
        self._self_artifact = artifact

    @property
    def artifact(self):
        return self._self_artifact

    def __repr__(self):
        return '<provenance.CallableArtifactProxy({}) {} >'.format(
            self.artifact.id, repr(self.__wrapped__))

    def __reduce__(self):
        return (load_proxy, (self.artifact.id,))


def artifact_proxy(value, artifact):
    if callable(value):
        return CallableArtifactProxy(value, artifact)
    return ArtifactProxy(value, artifact)
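
# Illustrative sketch (not part of the original module): loading an archived
# file's contents while keeping the original artifact attached. read_bytes is
# a hypothetical transformer; any function of the proxied value works.
def _example_transform(proxy_of_path):
    def read_bytes(path_proxy):
        with open(str(path_proxy), 'rb') as f:
            return f.read()
    return transform_value(proxy_of_path, read_bytes)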

def is_proxy(obj):
    return (type(obj) == ArtifactProxy or type(obj) == CallableArtifactProxy)

class Artifact(object):

    def __init__(self, repo, props, value=None, inputs=None, run_info=None):
        assert ('id' in props), "props must contain 'id'"
        assert ('value_id' in props), "props must contain 'value_id'"
        self.__dict__ = props.copy()
        self.repo = repo
        if value is not None:
            self._value = value
        if inputs is not None:
            self._inputs = inputs
        if run_info is not None:
            self._run_info = run_info

    @memoized_property
    def value(self):
        return self.repo.get_value(self)

    @memoized_property
    def inputs(self):
        return self.repo.get_inputs(self)

    @memoized_property
    def run_info(self):
        return self.repo.run_info(self.id)

    @property
    def tags(self):
        if self.custom_fields:
            return self.custom_fields.get('tags', None)

    def proxy(self):
        if self.composite:
            value = lazy_dict(t.valmap(lambda a: lambda: a.proxy(), self.value))
        else:
            value = self.value
        return artifact_proxy(value, self)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.id == other.id
        return NotImplemented

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(('--artifact--', self.id))

    def __repr__(self):
        return '<provenance.Artifact({})>'.format(self.id)

    def __reduce__(self):
        return (load_artifact, (self.id,))


@value_repr.register(Artifact)
def _(artifact):
    return ('artifact', artifact.id)


def _artifact_id(artifact_or_id):
    if isinstance(artifact_or_id, string_type):
        return artifact_or_id
    if hasattr(artifact_or_id, 'id'):
        return artifact_or_id.id
    if hasattr(artifact_or_id, 'artifact'):
        return artifact_or_id.artifact.id
    raise Exception("Unable to coerce into an artifact id: {}".format(
        artifact_or_id))


def _artifact_from_record(repo, record):
    if isinstance(record, Artifact):
        return record
    return Artifact(repo,
                    t.dissoc(record._asdict(), 'value', 'inputs', 'run_info'),
                    value=record.value, inputs=record.inputs,
                    run_info=record.run_info)


class ArtifactRepository(object):

    def __init__(self, read=True, write=True, read_through_write=True,
                 delete=False):
        self._read = read
        self._write = write
        self._read_through_write = read_through_write
        self._delete = delete

    def __getitem__(self, artifact_id):
        return self.get_by_id(artifact_id)

    def batch_get_by_id(self, artifact_ids):
        cs.ensure_read(self)
        return [self.get_by_id(id) for id in artifact_ids]


class MemoryRepo(ArtifactRepository):

    def __init__(self, artifacts=None, read=True, write=True,
                 read_through_write=True, delete=True):
        super(MemoryRepo, self).__init__(read=read, write=write,
                                         read_through_write=read_through_write,
                                         delete=delete)
        self.artifacts = artifacts if artifacts else []
        self.sets = []

    def __contains__(self, artifact_or_id):
        cs.ensure_contains(self)
        artifact_id = _artifact_id(artifact_or_id)
        if find_first(lambda a: a.id == artifact_id, self.artifacts):
            return True
        else:
            return False

    def put(self, record, read_through=False):
        artifact_id = _artifact_id(record)
        cs.ensure_put(self, artifact_id, read_through)
        self.artifacts.append(record)
        return _artifact_from_record(self, record)

    def get_by_id(self, artifact_id):
        cs.ensure_read(self)
        record = find_first(lambda a: a.id == artifact_id, self.artifacts)
        if record:
            return _artifact_from_record(self, record)
        else:
            raise KeyError(artifact_id, self)

    def get_by_value_id(self, value_id):
        cs.ensure_read(self)
        record = find_first(lambda a: a.value_id == value_id, self.artifacts)
        if record:
            return _artifact_from_record(self, record)
        else:
            raise KeyError(value_id, self)

    def get_value(self, artifact, composite=False):
        cs.ensure_read(self)
        return find_first(lambda a: a.id == artifact.id, self.artifacts).value

    def get_inputs(self, artifact):
        cs.ensure_read(self)
        return find_first(lambda a: a.id == artifact.id, self.artifacts).inputs

    def delete(self, artifact_or_id):
        artifact_id = _artifact_id(artifact_or_id)
        cs.ensure_delete(self)
        new_artifacts = list(t.filter(lambda a: a.id != artifact_id,
                                      self.artifacts))
        if len(new_artifacts) == len(self.artifacts):
            raise KeyError(artifact_id, self)
        else:
            self.artifacts = new_artifacts

    def contains_set(self, set_id):
        art_set = find_first(lambda s: s.id == set_id, self.sets)
        return True if art_set else False

    def get_set_by_id(self, set_id):
        cs.ensure_read(self)
        art_set = find_first(lambda s: s.id == set_id, self.sets)
        if not art_set:
            raise KeyError(self, set_id)
        return art_set

    def get_set_by_labels(self, labels):
        cs.ensure_read(self)
        labels = _check_labels_name(labels)
        versions = [s for s in self.sets if s.labels == labels]
        if not versions:
            raise KeyError(labels, self)
        return sorted(versions, key=lambda s: s.created_at, reverse=True)[0]

    def put_set(self, artifact_set, read_through=False):
        cs.ensure_write(self, 'put_set')
        self.sets.append(artifact_set)
        return artifact_set

    def delete_set(self, set_id):
        cs.ensure_delete(self, check_contains=False)
        prev_count = len(self.sets)
        self.sets = [s for s in self.sets if s.id != set_id]
        if len(self.sets) == prev_count:
            raise KeyError(set_id, self)


def _transform(val):
    if isinstance(val, Artifact):
        return {'id': val.id, 'type': 'Artifact', 'name': val.name}
    elif type(val) in {ArtifactProxy, CallableArtifactProxy}:
        return {'id': val.artifact.id, 'type': 'ArtifactProxy',
                'name': val.artifact.name}
    else:
        return val


def _inputs_json(inputs):
    expanded = t.valmap(_transform, inputs['kargs'])
    expanded['__varargs'] = list(t.map(_transform, inputs['varargs']))
    return expanded


def _ping_postgres(conn, branch):
    """
    Code taken from the example here:
    http://docs.sqlalchemy.org/en/latest/core/pooling.html#dealing-with-disconnects
    """
    if branch:
        # "branch" refers to a sub-connection of a connection,
        # we don't want to bother pinging on these.
        return

    # Turn off "close with result". This flag is only used with
    # "connectionless" execution, otherwise will be False in any case.
    save_should_close_with_result = conn.should_close_with_result
    conn.should_close_with_result = False

    try:
        # Run a SELECT 1. Use a core select() so that the SELECT of a scalar
        # value without a table is appropriately formatted for the backend.
        conn.scalar(sqlalchemy.select([1]))
    except sqlalchemy.exc.DBAPIError as err:
        # Catch SQLAlchemy's DBAPIError, which is a wrapper for the DBAPI's
        # exception. It includes a .connection_invalidated attribute which
        # specifies if this connection is a "disconnect" condition, which is
        # based on inspection of the original exception by the dialect in use.
        if err.connection_invalidated:
            # Run the same SELECT again - the connection will re-validate
            # itself and establish a new connection. The disconnect detection
            # here also causes the whole connection pool to be invalidated
            # so that all stale connections are discarded.
            conn.scalar(sqlalchemy.select([1]))
        else:
            raise
    finally:
        # Restore "close with result".
        conn.should_close_with_result = save_should_close_with_result


def _record_pid(dbapi_connection, connection_record):
    connection_record.info['pid'] = os.getpid()


def _check_pid(dbapi_connection, connection_record, connection_proxy):
    pid = os.getpid()
    if connection_record.info['pid'] != pid:
        connection_record.connection = connection_proxy.connection = None
        raise sqlalchemy.exc.DisconnectionError(
            "Connection record belongs to pid %s, "
            "attempting to check out in pid %s" %
            (connection_record.info['pid'], pid))


@t.curry
def _set_search_path(schema, dbapi_connection, connection_record,
                     connection_proxy):
    cursor = dbapi_connection.cursor()
    cursor.execute("SET search_path TO {};".format(schema))
    dbapi_connection.commit()
    cursor.close()


def _db_engine(conn_string, schema, persistent_connections=True):
    poolclass = None if persistent_connections else sqlalchemy.pool.NullPool
    db_engine = sqlalchemy.create_engine(conn_string,
                                         json_serializer=Encoder().encode,
                                         poolclass=poolclass)
    sqlalchemy.event.listens_for(db_engine, "engine_connect")(_ping_postgres)
    sqlalchemy.event.listens_for(db_engine, "connect")(_record_pid)
    sqlalchemy.event.listens_for(db_engine, "checkout")(_check_pid)
    if schema:
        sqlalchemy.event.listens_for(db_engine,
                                     "checkout")(_set_search_path(schema))
    return db_engine


def _insert_set_members_sql(artifact_set):
    pairs = [(artifact_set.id, id) for id in artifact_set.artifact_ids]
    return """
    INSERT INTO artifact_set_members (set_id, artifact_id)
    VALUES {}
    ON CONFLICT DO NOTHING
    """.strip().format(",\n".join(t.map(str, pairs)))


class Encoder(json.JSONEncoder):

    def default(self, val):
        if isinstance(val, datetime):
            return str(val)
        elif isinstance(val, np.integer):
            return int(val)
        elif isinstance(val, np.floating):
            return float(val)
        elif isinstance(val, np.bool_):
            return bool(val)
        elif isinstance(val, np.ndarray):
            return val.tolist()
        elif is_proxy(val) or isinstance(val, Artifact):
            return repr(val)
        elif callable(val):
            try:
                return utils.fn_info(val)
            except:
                pass
        else:
            try:
                return super(Encoder, self).default(val)
            except Exception as e:
                print("Could not serialize type: {}".format(type(val)))
                return str(type(val))


class PostgresRepo(ArtifactRepository):
    # TODO: add the upgrade_db param back once upgrade is working
    # upgrade_db=True

    def __init__(self, db, store, read=True, write=True,
                 read_through_write=True, delete=True, create_db=False,
                 schema=None, create_schema=True, persistent_connections=True):
        upgrade_db = False
        super(PostgresRepo, self).__init__(read=read, write=write,
                                           read_through_write=read_through_write,
                                           delete=delete)
        if not isinstance(db, string_type) and schema is not None:
            raise ValueError("You can only provide a schema with a DB url.")

        init_db = False
        if create_db and isinstance(db, string_type):
            _create_db_if_needed(db)
            init_db = True
            upgrade_db = False

        self._run = None
        if isinstance(db, string_type):
            if create_db:
                init_db = True
            self._db_engine = _db_engine(db, schema, persistent_connections)
            self._sessionmaker = sqlalchemy.orm.sessionmaker(bind=self._db_engine)
        else:
            self._session = db

        if create_schema and schema is not None:
            with self.session() as session:
                q = sa.exists(
                    sa.select([("schema_name")])
                    .select_from(sa.text("information_schema.schemata"))
                    .where(sa.text("schema_name = :schema")
                           .bindparams(schema=schema)))
                if not session.query(q).scalar():
                    session.execute(CreateSchema(schema))
                    session.commit()
                    init_db = True
                    upgrade_db = False

        if init_db:
            self._db_init()

        if upgrade_db:
            self._db_upgrade()

        self.blobstore = store

    @contextmanager
    def session(self):
        if hasattr(self, '_session'):
            close = False
        else:
            self._session = self._sessionmaker()
            close = True
        try:
            yield self._session
        except:
            self._session.rollback()
            raise
        finally:
            if close:
                self._session.close()
                del self._session

    def __contains__(self, artifact_or_id):
        cs.ensure_contains(self)
        artifact_id = _artifact_id(artifact_or_id)
        with self.session() as s:
            return s.query(db.Artifact).filter(
                db.Artifact.id == artifact_id).count() > 0

    def _upsert_run(self, session, info):
        sql = pg.insert(db.Run).values(
            id=info['id'],
            info=info,
            hostname=info['host']['nodename'],
            created_at=info['created_at']
        ).on_conflict_do_nothing(index_elements=['id'])
        session.execute(sql)
        return db.Run(info)

    @property
    def db_revision(self):
        with self.session() as session:
            context = MigrationContext.configure(session.connection())
            return context.get_current_revision()

    def _db_init(self):
        db.Base.metadata.create_all(self._db_engine)
        with self.session() as session:
            conn = session.connection()
            # the below doesn't work for some reason
            # db.Base.metadata.create_all(conn)
            cfg = _alembic_config(conn)
            command.stamp(cfg, "head")

    def _db_upgrade(self):
        with self.session() as session:
            conn = session.connection()
            cfg = _alembic_config(conn)
            command.upgrade(cfg, "head")

    def put(self, artifact_record, read_through=False):
        with self.session() as session:
            cs.ensure_put(self, artifact_record.id, read_through)
            self.blobstore.put(artifact_record.id, artifact_record.inputs,
                               s.DEFAULT_INPUT_SERIALIZER)
            self.blobstore.put(artifact_record.value_id, artifact_record.value,
                               s.serializer(artifact_record))
            inputs_json = _inputs_json(artifact_record.inputs)
            run = self._upsert_run(session, artifact_record.run_info)
            db_artifact = db.Artifact(artifact_record, inputs_json, run)
            session.add(db_artifact)
            session.commit()
            return _artifact_from_record(self, artifact_record)

    def get_by_id(self, artifact_id):
        cs.ensure_read(self)
        with self.session() as session:
            result = session.query(db.Artifact).filter(
                db.Artifact.id == artifact_id).first()
            if result:
                return Artifact(self, result.props)
            else:
                raise KeyError(artifact_id, self)

    def batch_get_by_id(self, artifact_ids):
        cs.ensure_read(self)
        with self.session() as session:
            results = session.query(db.Artifact).filter(
                db.Artifact.id.in_(artifact_ids)).all()
            if len(results) == len(artifact_ids):
                return [Artifact(self, result.props) for result in results]
            else:
                ids = set(artifact_ids)
                found = set([a.id for a in results])
                missing = ids - found
                raise KeyError(missing, self)

    def get_by_value_id(self, value_id):
        cs.ensure_read(self)
        with self.session() as session:
            result = session.query(db.Artifact).filter(
                db.Artifact.value_id == value_id).first()
            if result:
                return Artifact(self, result.props)
            else:
                raise KeyError(value_id, self)

    def get_value(self, artifact):
        cs.ensure_read(self)
        return self.blobstore.get(artifact.value_id, s.serializer(artifact))

    def get_inputs(self, artifact):
        cs.ensure_read(self)
        return self.blobstore.get(artifact.id, s.DEFAULT_INPUT_SERIALIZER)

    def delete(self, artifact_or_id):
        with self.session() as session:
            cs.ensure_delete(self)
            artifact = self.get_by_id(artifact_or_id)
            (session.query(db.Artifact).
             filter(db.Artifact.id == artifact.id).delete())
            self.blobstore.delete(artifact.id)
            self.blobstore.delete(artifact.value_id)
            session.commit()

    def put_set(self, artifact_set, read_through=False):
        with self.session() as session:
            cs.ensure_write(self, 'put_set')
            db_set = db.ArtifactSet(artifact_set)
            session.add(db_set)
            session.execute(_insert_set_members_sql(artifact_set))
            session.commit()
            return artifact_set

    def _db_to_mem_set(self, result):
        with self.session() as session:
            members = (session.query(db.ArtifactSetMember)
                       .filter(db.ArtifactSetMember.set_id == result.set_id)
                       .all())
            props = result.props
            props['artifact_ids'] = [m.artifact_id for m in members]
            return ArtifactSet(**props)

    def contains_set(self, set_id):
        with self.session() as session:
            if (session.query(db.ArtifactSet)
                    .filter(db.ArtifactSet.set_id == set_id).count() > 0):
                return True
            else:
                return False

    def get_set_by_id(self, set_id):
        cs.ensure_read(self)
        with self.session() as session:
            result = (session.query(db.ArtifactSet)
                      .filter(db.ArtifactSet.set_id == set_id).first())
            if result:
                return self._db_to_mem_set(result)
            else:
                raise KeyError(set_id, self)

    def get_set_by_labels(self, labels):
        cs.ensure_read(self)
        labels = _check_labels_name(labels)
        with self.session() as session:
            result = (session.query(db.ArtifactSet)
                      .filter(db.ArtifactSet.labels == labels)
                      .order_by(db.ArtifactSet.created_at.desc())
                      .first())
            if result:
                return self._db_to_mem_set(result)
            else:
                raise KeyError(labels, self)

    def delete_set(self, set_id):
        cs.ensure_delete(self, check_contains=False)
        with self.session() as session:
            num_deleted = (session.query(db.ArtifactSet).
                           filter(db.ArtifactSet.set_id == set_id).delete())
            (session.query(db.ArtifactSetMember).
             filter(db.ArtifactSetMember.set_id == set_id).delete())
            if num_deleted == 0:
                raise KeyError(set_id, self)

    def run_info(self, artifact_id):
        with self.session() as session:
            result = (session.query(db.Run)
                      .filter(db.Run.id == db.Artifact.run_id)
                      .filter(db.Artifact.id == artifact_id)
                      .first())
            return result.info_with_datetimes

    def _filename(self, artifact_id):
        return self.blobstore._filename(artifact_id)


DbRepo = PostgresRepo


def _put_only_value(store, id, value, **kargs):
    return store.put(value, **kargs)


def _put_set(store, id, value, **kargs):
    return store.put_set(value, **kargs)


def _contains_set(store, id):
    return store.contains_set(id)


def _delete_set(store, id):
    return store.delete_set(id)


class ChainedRepo(ArtifactRepository):

    def __init__(self, repos):
        self.stores = repos

    def __contains__(self, id):
        return cs.chained_contains(self, id)

    def put(self, record):
        return cs.chained_put(self, record.id, record, put=_put_only_value)

    def put_set(self, artifact_set, read_through=False):
        return cs.chained_put(self, None, artifact_set,
                              contains=_contains_set, put=_put_set)

    def get_by_id(self, artifact_id):
        def get(store, id):
            return store.get_by_id(id)
        return cs.chained_get(self, get, artifact_id, put=_put_only_value)

    def contains_set(self, id):
        return cs.chained_contains(self, id, contains=_contains_set)

    def get_set_by_id(self, set_id):
        def get(store, id):
            return store.get_set_by_id(id)
        return cs.chained_get(self, get, set_id, put=_put_set)

    def get_set_by_labels(self, set_name):
        def get(store, name):
            return store.get_set_by_labels(name)
        return cs.chained_get(self, get, set_name, put=_put_set)

    def delete_set(self, id):
        return cs.chained_delete(self, id, contains=_contains_set,
                                 delete=_delete_set)

    def get_by_value_id(self, value_id):
        def get(store, id):
            return store.get_by_value_id(id)
        return cs.chained_get(self, get, value_id, put=_put_only_value)

    def get_value(self, artifact):
        for store in self.stores:
            try:
                return store.get_value(artifact)
            except KeyError:
                pass
        raise KeyError(artifact, self)

    def delete(self, id):
        return cs.chained_delete(self, id)

    def _filename(self, id):
        return cs.chained_filename(self, id)


### ArtifactSet logic

def _set_op(operator, *sets, labels=None):
    new_ids = t.reduce(operator, t.map(lambda s: s.artifact_ids, sets))
    return ArtifactSet(new_ids, labels)


set_union = t.partial(_set_op, ops.or_)
set_difference = t.partial(_set_op, ops.sub)
set_intersection = t.partial(_set_op, ops.and_)

artifact_set_properties = ['id', 'artifact_ids', 'created_at', 'labels']


class ArtifactSet(namedtuple('ArtifactSet', artifact_set_properties)):

    def __new__(cls, artifact_ids, labels=None, created_at=None, id=None):
        artifact_ids = t.map(_artifact_id, artifact_ids)
        labels = _check_labels_name(labels)
        ids = frozenset(artifact_ids)
        if id:
            set_id = id
        else:
            set_id = hash(ids)
        created_at = created_at if created_at else datetime.utcnow()
        return super(ArtifactSet, cls).__new__(cls, set_id, ids, created_at,
                                               labels)

    @property
    def name(self):
        if self.labels is not None:
            return self.labels.get('name')

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.id == other.id
        return NotImplemented

    def artifacts_named(self, artifact_name):
        pass  # return all artifacts found wrapped in a list

    def add(self, artifact_or_id, labels=None):
        artifact_id = _artifact_id(artifact_or_id)
        return ArtifactSet(self.artifact_ids | {artifact_id}, labels)

    def remove(self, artifact_or_id, labels=None):
        artifact_id = _artifact_id(artifact_or_id)
        return ArtifactSet(self.artifact_ids - {artifact_id}, labels)

    def union(self, *sets, labels=None):
        return set_union(self, *sets, labels=labels)

    def __or__(self, other_set):
        return set_union(self, other_set)

    def difference(self, *sets, labels=None, repo=None):
        return set_difference(self, *sets, labels=labels)

    def __sub__(self, other_set):
        return set_difference(self, other_set)

    def intersection(self, *sets, labels=None):
        return set_intersection(self, *sets, labels=labels)

    def __and__(self, other_set):
        return set_intersection(self, other_set)

    def relabel(self, labels):
        labels = _check_labels_name(labels)
        return self._replace(labels=labels)

    def rename(self, name):
        return self.relabel({'name': name})

    def put(self, repo=None):
        repo = repo if repo else get_default_repo()
        return repo.put_set(self)

    def proxy_dict(self, group_artifacts_of_same_name=False):
        """
        Returns a lazy_proxy_dict of the artifacts that are contained in this
        set. See the documentation for lazy_proxy_dict for more information.
        """
        return lazy_proxy_dict(self.artifact_ids, group_artifacts_of_same_name)


def save_artifact(f, artifact_ids):
    def wrapped(*args, **kargs):
        artifact = f(*args, **kargs)
        artifact_ids.add(artifact.id)
        return artifact
    return wrapped


class RepoSpy(wrapt.ObjectProxy):

    def __init__(self, repo):
        super(RepoSpy, self).__init__(repo)
        self.artifact_ids = set()
        self.put = save_artifact(repo.put, self.artifact_ids)
        self.get_by_id = save_artifact(repo.get_by_id, self.artifact_ids)
        self.get_by_value_id = save_artifact(repo.get_by_value_id,
                                             self.artifact_ids)
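
# Illustrative sketch (not part of the original module): ArtifactSets compose
# with ordinary set algebra via the operators defined above; the sets passed in
# are placeholders and 'combined' is a hypothetical name.
def _example_set_algebra(set_a, set_b):
    combined = set_a | set_b    # set_union
    only_a = set_a - set_b      # set_difference
    shared = set_a & set_b      # set_intersection
    return combined.rename('combined'), only_a, shared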

@contextmanager
def capture_set(labels=None, initial_set=None):
    if initial_set:
        initial = set(t.map(_artifact_id, initial_set))
    else:
        initial = set()
    repo = get_default_repo()
    spy = RepoSpy(repo)
    with using_repo(spy):
        result = []
        yield result
    artifact_ids = spy.artifact_ids | initial
    result.append(ArtifactSet(artifact_ids, labels=labels).put(repo))
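
# Illustrative sketch (not part of the original module): capture_set records
# every artifact put or fetched inside the block and saves them as a labeled
# set. 'nightly-report' is a hypothetical label.
def _example_capture(build_artifacts_fn):
    with capture_set(labels='nightly-report') as captured:
        build_artifacts_fn()
    return captured[0]   # the persisted ArtifactSet appended by capture_set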

def coerce_to_artifact(artifact_or_id, repo=None):
    repo = repo if repo else get_default_repo()
    if isinstance(artifact_or_id, string_type):
        return repo.get_by_id(artifact_or_id)
    if isinstance(artifact_or_id, Artifact):
        return artifact_or_id
    if is_proxy(artifact_or_id):
        return artifact_or_id.artifact
    raise ValueError('Was unable to coerce object into an Artifact: {}'
                     .format(artifact_or_id))


def coerce_to_artifacts(artifact_or_ids, repo=None):
    repo = repo if repo else get_default_repo()
    # TODO: bring this back when/if batch_get_by_id is added to chained repo
    # if all(isinstance(a, string_type) for a in artifact_or_ids):
    #     return repo.batch_get_by_id(artifact_or_ids)
    return [coerce_to_artifact(a, repo) for a in artifact_or_ids]

class lazy_dict(object):

    def __init__(self, thunks):
        self.thunks = thunks
        self.realized = {}

    def __getstate__(self):
        return self.thunks

    def __setstate__(self, thunks):
        self.__init__(thunks)

    def __getitem__(self, key):
        if key in self.thunks:
            if key not in self.realized:
                self.realized[key] = self.thunks[key]()
            return self.realized[key]
        else:
            raise KeyError(key, self)

    def __setitem__(self, key, value):
        self.thunks[key] = lambda: value
        self.realized[key] = value

    def __delitem__(self, key):
        if key in self.thunks:
            del self.thunks[key]
            if key in self.realized:
                del self.realized[key]
        else:
            raise KeyError(key, self)

    def __contains__(self, key):
        return key in self.thunks

    def items(self):
        return ((key, self[key]) for key in self.thunks.keys())

    def keys(self):
        return self.thunks.keys()

    def values(self):
        return (self[key] for key in self.thunks.keys())

    def __repr__(self):
        return "lazy_dict({})".format(
            t.merge(t.valmap(lambda _: "...", self.thunks), self.realized))
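
# Illustrative sketch (not part of the original module): lazy_dict values are
# thunks that are only called (and then cached) on first access.
def _example_lazy_dict():
    d = lazy_dict({'answer': lambda: 41 + 1})
    assert 'answer' in d        # membership does not force the thunk
    assert d['answer'] == 42    # first access realizes and caches the value
    return d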

def lazy_proxy_dict(artifacts_or_ids, group_artifacts_of_same_name=False):
    """
    Takes a list of artifacts or artifact ids and returns a dictionary whose
    keys are the names of the artifacts. The values will be lazily loaded into
    proxies as requested.

    Parameters
    ----------
    artifacts_or_ids : collection of artifacts or artifact ids (strings)

    group_artifacts_of_same_name: bool (default: False)
        If set to True then artifacts of the same name will be grouped together
        in one list. When set to False an exception will be raised if more than
        one artifact shares the same name.
    """
    if isinstance(artifacts_or_ids, dict):
        artifacts = t.valmap(coerce_to_artifact, artifacts_or_ids)
        lambdas = {name: (lambda a: lambda: a.proxy())(a)
                   for name, a in artifacts.items()}
        return lazy_dict(lambdas)

    # else we have a collection
    artifacts = coerce_to_artifacts(artifacts_or_ids)
    by_name = t.groupby(lambda a: a.name, artifacts)
    singles = t.valfilter(lambda l: len(l) == 1, by_name)
    multi = t.valfilter(lambda l: len(l) > 1, by_name)

    lambdas = {name: (lambda a: lambda: a.proxy())(a[0])
               for name, a in singles.items()}

    if group_artifacts_of_same_name and len(multi) > 0:
        lambdas = t.merge(
            lambdas,
            {name: (lambda artifacts: (lambda: [a.proxy() for a in artifacts]))(artifacts)
             for name, artifacts in multi.items()})

    if not group_artifacts_of_same_name and len(multi) > 0:
        raise ValueError("""Only artifacts with distinct names can be used in a
lazy_proxy_dict. Offending names: {}
Use the option `group_artifacts_of_same_name=True` if you want a list of
proxies to be returned under the respective key.
        """.format({n: len(a) for n, a in multi.items()}))

    return lazy_dict(lambdas)
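
# Illustrative sketch (not part of the original module): building a dictionary
# of proxies keyed by artifact name. The ids passed in are placeholders;
# duplicate names are grouped into lists only when
# group_artifacts_of_same_name=True.
def _example_proxy_dict(artifact_ids):
    proxies = lazy_proxy_dict(artifact_ids, group_artifacts_of_same_name=True)
    return {name: proxies[name] for name in proxies.keys()}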