Source code for dgitcore.contrib.instrumentations.content
#!/usr/bin/env python
import os, sys
from dgitcore.plugins.instrumentation import InstrumentationBase
from dgitcore.config import get_config
from hashlib import sha1
import mimetypes
from messytables import (CSVTableSet, type_guess, headers_guess,
offset_processor, DateType, StringType,
DecimalType, IntegerType,
DateUtilType, BoolType,
rowset_as_jts, headers_and_typed_as_jts)
def compute_sha1(filename):
    """Return the hexadecimal SHA-1 digest of the file at *filename*.

    The file is read as raw bytes in fixed-size chunks, so binary files and
    text files in any encoding are hashed exactly as stored on disk, and the
    whole file is never held in memory at once.

    Raises whatever ``open`` raises (e.g. ``IOError``/``OSError``) when the
    file cannot be read.
    """
    chunk_size = 0x1000000  # 16 MiB per read
    h = sha1()
    # Binary mode avoids decode errors and newline translation, both of
    # which would otherwise break or alter the digest; the context manager
    # guarantees the handle is closed.
    with open(filename, 'rb') as fd:
        for buf in iter(lambda: fd.read(chunk_size), b''):
            h.update(buf)
    return h.hexdigest()
class ContentInstrumentation(InstrumentationBase):
    """Instrumentation to extract content summaries including mimetypes,
    sha1 signature and schema where possible.
    """

    def __init__(self):
        """Enable the plugin and register it as 'content', version 'v0'."""
        self.enable = 'y'
        super(ContentInstrumentation, self).__init__('content',
                                                     'v0',
                                                     "Basic content analysis")

    def update(self, config):
        """Annotate every existing file listed in *config* with a summary.

        For each entry in ``config['files']`` whose ``'filename'`` exists on
        disk, the entry is updated in place with:

        * ``'mimetype'`` -- the type guessed from the filename by
          ``mimetypes.guess_type`` (``None`` when unknown);
        * ``'sha1'`` -- the digest returned by ``compute_sha1``;
        * ``'schema'`` -- only for names ending in "sv" (csv/tsv): the column
          types guessed by messytables from a sample of the first table.

        Entries whose file does not exist are left untouched.

        Returns the same *config* object that was passed in.
        """
        # Update the mime, sha1 of the files
        for entry in config['files']:
            filename = entry['filename']
            if not os.path.exists(filename):
                continue

            summary = {
                'mimetype': mimetypes.guess_type(filename)[0],
                'sha1': compute_sha1(filename),
            }

            if filename.lower().endswith('sv'):  # csv/tsv
                # The table set needs an open file object; the sample is
                # read lazily, so guess the types while the handle is still
                # open.
                with open(filename, 'rb') as csv_file:
                    rows = CSVTableSet(csv_file).tables[0]
                    guessed_types = type_guess(rows.sample)
                summary['schema'] = guessed_types

            entry.update(summary)

        return config
def setup(mgr):
    """Plugin entry point: register the content instrumentation with *mgr*.

    A new ``ContentInstrumentation`` is created and handed to
    ``mgr.register`` under the ``'instrumentation'`` category.
    """
    mgr.register('instrumentation', ContentInstrumentation())