Source code for dgitcore.datasets.auto

import os, sys, getpass, stat, glob, json, platform
import subprocess, time, traceback
from collections import OrderedDict
import fnmatch, re
from ..config import get_config
from ..plugins.common import plugins_get_mgr
from .common import clone as common_clone, init as common_init, post as common_post
from .files import add as files_add
from .history import get_history
from .detect import get_schema
from datetime import datetime

#####################################################
# Exports
#####################################################

# Public names exported when this module is star-imported.
__all__ = ['auto_update', 'auto_init', 'auto_get_repo']

def find_executable_files():
    """
    Find max 5 executables that are responsible for this repo.

    Looks at entries in the current working directory and up to two
    levels of subdirectories (``*``, ``*/*``, ``*/*/*``) and keeps the
    regular files that have any execute bit (user, group or other) set.

    Returns
    -------

    List of at most 5 relative paths, in the order glob produced them.
    """
    candidates = glob.glob("*") + glob.glob("*/*") + glob.glob('*/*/*')
    executable = stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH
    final = []
    for filename in candidates:
        # Directories usually carry execute bits too; only files count.
        if not os.path.isfile(filename):
            continue
        if os.stat(filename).st_mode & executable:
            final.append(filename)
            # Stop as soon as the documented cap is reached (the cap
            # used to be checked with '>' which let a 6th file through).
            if len(final) >= 5:
                break
    return final

def _prompt_with_default(template, default):
    """Ask for a value, showing *default*; an empty answer keeps the default."""
    revised = input(template.format(default))
    if revised not in ["", None]:
        return revised
    return default


def _prompt_required(prompt, complaint):
    """Keep asking until the user types a non-empty answer."""
    while True:
        answer = input(prompt)
        if answer != "":
            return answer
        print(complaint)


def auto_init(autofile, force_init=False):
    """
    Initialize a repo-specific configuration file to execute dgit

    If the file already exists (and re-initialization is not forced) its
    JSON content is loaded and returned. Otherwise the user is prompted
    for the repo details, a fresh configuration is written to
    ``autofile`` and the process exits via ``sys.exit()`` so that the
    user can review the file before rerunning.

    Parameters
    ----------

    autofile: Repo-specific configuration file (dgit.json)
    force_init: Flag to force to re-initialization of the configuration file

    Returns
    -------

    The parsed content of ``autofile`` when an existing file is loaded.
    Does not return on the creation path (the process exits).

    Raises
    ------

    Exception("Invalid configuration file") if the existing file cannot
    be read or parsed.
    """

    if os.path.exists(autofile) and not force_init:
        try:
            # Context manager so the handle is closed even if parsing fails
            with open(autofile) as fd:
                autooptions = json.loads(fd.read())
            return autooptions
        except Exception:
            print("Error in dgit.json configuration file")
            traceback.print_exc()
            raise Exception("Invalid configuration file")

    # NOTE(review): the returned config is not used below; the call is
    # kept in case loading the configuration has side effects.
    config = get_config()
    pluginmgr = plugins_get_mgr()

    print("Repo configuration file missing or corrupted. Creating one")
    print("Let us know a few details about your data repository")

    # Get the username
    username = _prompt_with_default("Please specify username [{}]",
                                    getpass.getuser())

    # Get the reponame (defaults to the name of the current directory)
    thisdir = os.path.abspath(os.getcwd())
    reponame = _prompt_with_default("Please specify repo name [{}]",
                                    os.path.basename(thisdir))

    # Get the default backend URL
    keys = pluginmgr.search('backend')
    keys = keys['backend']
    keys = [k for k in keys if k[0] != "local"]
    remoteurl = ""
    backend = None
    if len(keys) > 0:
        backend = pluginmgr.get_by_key('backend', keys[0])
        candidate = backend.url(username, reponame)
        remoteurl = _prompt_with_default("Please specify remote URL [{}]",
                                         candidate)

    # Get title...
    title = _prompt_required("One line summary of your repo:",
                             "The repo requires a one line summary")

    # Get description
    description = _prompt_required("Detailed description:",
                                   "The repo requires some text as well")

    autooptions = OrderedDict([
        ("username", username),
        ("reponame", reponame),
        ("remoteurl", remoteurl),
        ("title", title),
        ("description", description),
        ("working-directory", "."),
        ('track', OrderedDict([
            ('includes', ['*.csv', '*.tsv', '*.txt', '*.json',
                          '*.xlsx', "*.sql", "*.hql"]),
            ('excludes', ['.git', '.svn', os.path.basename(autofile)]),
        ])),
        ('auto-push', False),
        ('pipeline', OrderedDict([])),
        ('import', OrderedDict([
            ('directory-mapping', OrderedDict([
                ('.', '')
            ]))
        ])),
        ('dependencies', OrderedDict([]))
    ])

    # Gather options from each of the enabled plugins
    for p in ['validator', 'transformer']:
        keys = pluginmgr.search(p)
        keys = keys[p]
        options = OrderedDict()
        for k in keys:
            # Only the first plugin found for a given name contributes
            if k.name in options:
                continue
            mgr = pluginmgr.get_by_key(p, k)
            options[k.name] = mgr.autooptions()
        autooptions[p] = options

    keys = pluginmgr.search('metadata')
    keys = keys['metadata']
    if len(keys) > 0:

        # => Select domains that be included.
        servers = []
        for k in keys:
            server = pluginmgr.get_by_key('metadata', k)
            # Third '/'-separated component of the plugin URL
            server = server.url.split("/")[2]
            servers.append(server)

        # Specify what should be included. Some of these should go into
        # the metadata modules
        autooptions.update(OrderedDict([
            ('metadata-management', OrderedDict([
                ('servers', servers),
                ('include-code-history', find_executable_files()),
                ('include-preview', OrderedDict([
                    ('length', 512),
                    ('files', ['*.txt', '*.csv', '*.tsv'])
                ])),
                ('include-data-history', True),
                ('include-validation', True),
                ('include-dependencies', True),
                ('include-schema', ['*.csv', '*.tsv']),
                ('include-tab-diffs', ['*.csv', '*.tsv']),
                ('include-platform', True),
            ]))]))

    with open(autofile, 'w') as fd:
        fd.write(json.dumps(autooptions, indent=4))

    print("")
    print("Updated dataset specific config file: {}".format(autofile))
    print("Please edit it and rerun dgit auto.")
    print("Tip: Consider committing dgit.json to the code repository.")

    #if platform.system() == "Linux":
    #    subprocess.call(["xdg-open", autofile])

    # Stop here so the user can edit the generated file before rerunning
    sys.exit()
def auto_get_repo(autooptions, debug=False):
    """
    Automatically get repo

    Tries, in order: look the repo up locally, clone it from the
    configured remote URL and look it up again, and finally offer to
    create a new repo.

    Parameters
    ----------

    autooptions: dgit.json content
    debug: Print progress messages for each step when True

    Returns
    -------

    The repo object, with its ``options`` attribute set to ``autooptions``.

    Raises
    ------

    Exception("Cannot load repo") if lookup and clone both fail and the
    user declines to create a new repo.
    """

    # plugin manager
    pluginmgr = plugins_get_mgr()

    # get the repo manager
    repomgr = pluginmgr.get(what='repomanager', name='git')

    username = autooptions['username']
    reponame = autooptions['reponame']

    repo = None

    try:
        if debug:
            print("Looking repo")
        repo = repomgr.lookup(username=username, reponame=reponame)
    except Exception:
        # Only ordinary failures fall through to cloning; Ctrl-C and
        # SystemExit propagate instead of triggering the prompts below.
        # Clone the repo
        try:
            print("Checking and cloning if the dataset exists on backend")
            url = autooptions['remoteurl']
            if debug:
                print("Doesnt exist. trying to clone: {}".format(url))
            common_clone(url)
            repo = repomgr.lookup(username=username, reponame=reponame)
            if debug:
                print("Cloning successful")
        except Exception:
            # traceback.print_exc()
            yes = input("Repo doesnt exist. Should I create one? [yN]")
            if yes == 'y':
                # s3:// remotes need the combined git+s3 setup
                setup = "git"
                if autooptions['remoteurl'].startswith('s3://'):
                    setup = 'git+s3'
                repo = common_init(username=username,
                                   reponame=reponame,
                                   setup=setup,
                                   force=True,
                                   options=autooptions)

                if debug:
                    print("Successfully inited repo")
            else:
                raise Exception("Cannot load repo")

    repo.options = autooptions

    return repo
def get_files_to_commit(autooptions):
    """
    Look through the local directory to pick up files to check

    Walks ``autooptions['working-directory']`` and returns the paths of
    files whose name matches one of the ``track.includes`` glob patterns
    and none of the ``track.excludes`` patterns. Directories whose name
    matches an exclude pattern are not descended into.

    Parameters
    ----------

    autooptions: dgit.json content

    Returns
    -------

    List of matching file paths (each joined onto the walked root).
    """
    workingdir = autooptions['working-directory']
    includes = autooptions['track']['includes']
    excludes = autooptions['track']['excludes']

    # transform glob patterns to regular expressions (compiled once,
    # outside the walk)
    include_re = re.compile(r'|'.join([fnmatch.translate(x) for x in includes]))
    # '$.' can never match, so an empty exclude list excludes nothing
    exclude_re = re.compile(
        r'|'.join([fnmatch.translate(x) for x in excludes]) or r'$.')

    matched_files = []
    for root, dirs, files in os.walk(workingdir):

        # exclude dirs: assign in place so os.walk skips pruned directories
        dirs[:] = [d for d in dirs if not exclude_re.match(d)]

        # exclude/include files
        matched_files.extend(
            os.path.join(root, f)
            for f in files
            if not exclude_re.match(f) and include_re.match(f)
        )

    return matched_files


def auto_add(repo, autooptions, files):
    """
    Cleanup the paths and add

    Each file is added to the repo under a target directory obtained by
    rewriting its leading directory according to
    ``autooptions['import']['directory-mapping']`` (default ``{".": ""}``).

    Parameters
    ----------

    repo: Repo object passed through to the file-add helper
    autooptions: dgit.json content
    files: Paths of the files to add

    Returns
    -------

    Sum of the values returned by the file-add helper for each file.
    """
    # Get the mappings and keys.
    mapping = {".": ""}
    if (('import' in autooptions) and
            ('directory-mapping' in autooptions['import'])):
        mapping = autooptions['import']['directory-mapping']

    # Apply the longest prefix first...
    keys = sorted(mapping, key=len, reverse=True)

    count = 0
    for f in files:

        # Find the destination
        relativepath = f
        for k in keys:
            prefix = k + "/"
            if f.startswith(prefix):
                # Swap only the leading prefix; str.replace would also
                # rewrite any later occurrence of it inside the path.
                relativepath = mapping[k] + f[len(prefix):]
                break

        # Now add to repository
        count += files_add(repo=repo,
                           args=[f],
                           targetdir=os.path.dirname(relativepath))

    return count


def auto_update(autofile, force_init):
    """
    Collect tracked files, commit them and sync the repo

    Parameters
    ----------

    autofile: Repo-specific configuration file (dgit.json)
    force_init: Flag to force re-initialization of the configuration file
    """

    # Gather the repo name...
    autooptions = auto_init(autofile, force_init)

    # Load repo from the dgit.json file
    repo = auto_get_repo(autooptions)

    # find all the files that must be collected
    files = get_files_to_commit(autooptions)
    if len(files) > 10:
        print("Large number ({}) files are being added.".format(len(files)))
        proceed = input("Do you wish to proceed? [yN] ")
        if proceed != 'y':
            return

    # Add the files to the repo
    count = auto_add(repo, autooptions, files)
    if count == 0:
        print("There is no change in repo")
    else:
        # Commit the changes
        ts = datetime.now().isoformat()
        message = input("Quick summary of changes? ")
        if message in [None, '']:
            message = "Automatic commit on {}".format(ts)
        repo.run('commit', ['-a', '-m', message])

        # Add the dgit.json as a note in the dgit_config namespace
        autofile = os.path.abspath(autofile)
        repo.run('notes', ['--ref', 'dgit_config', 'add',
                           '-F', autofile, 'HEAD'])

    # Push notes and commits to server
    print("Sync'ing with backend")
    if 'auto-push' in autooptions and autooptions['auto-push']:
        repo.run('push', ['origin', "refs/notes/*"])
        repo.run('push', ['origin', 'master'])

    # Collect all the metadata and post
    common_post(repo)