# Source code for cis_interface.dataio.AsciiTable

import numpy as np
from cis_interface.interface.scanf import scanf
from cis_interface.dataio.AsciiFile import AsciiFile
from cis_interface import backwards
try:
    from astropy.io import ascii as apy_ascii
    from astropy.table import Table as apy_Table
    _use_astropy = True
except:  # pragma: no cover
    apy_ascii, apy_Table = None, None
    print("astropy is not installed, reading/writing as an array will be " +
          "disabled. astropy can be installed using 'pip install astropy'.")
    _use_astropy = False


# Attribute defaults applied by AsciiTable.__init__ when the instance does not
# already have an attribute of the same name (column separator is a tab).
_default_args = {'column': '\t'}
# The '%' character as bytes; used to check that C format codes start with
# '%' and to count the format codes in a format string.
_fmt_char = backwards.unicode2bytes('%')


def nptype2cformat(nptype):
    r"""Convert a numpy data type to a c format string.

    Args:
        nptype (str or numpy.dtype): Numpy data type that should be converted.

    Returns:
        str: Corresponding c format specification string.

    Raises:
        TypeError: If nptype is not a string or numpy.dtype.
        ValueError: If a matching format string cannot be determined.

    """
    if isinstance(nptype, np.dtype):
        t = nptype
    elif isinstance(nptype, str):
        t = np.dtype(nptype)
    else:
        raise TypeError("Input must be a string or a numpy.dtype")
    # Integer types are checked in order and the first match wins; several
    # names can alias the same dtype depending on the platform (e.g. C long
    # and C long long).
    int_formats = (
        ("int8", "%hhd"),
        ("short", "%hd"),
        ("intc", "%d"),
        ("int_", "%ld"),
        ("longlong", "%lld"),  # Only reached if different than C long
        ("uint8", "%hhu"),
        ("ushort", "%hu"),
        ("uintc", "%u"),
        ("uint64", "%lu"),  # Platform dependent
        ("ulonglong", "%llu"),
    )
    if t in [np.dtype(x) for x in ("float16", "float32", "float64")]:
        return "%g"  # Ensures readability
    for name, cfmt in int_formats:
        if t == np.dtype(name):
            return cfmt
    if t.kind == 'S':
        # Bytes strings: one byte per character.
        if t.itemsize == 0:
            return '%s'
        return "%" + str(t.itemsize) + "s"
    if t.kind == 'U':
        # Unicode strings: derive the character count from the item size so
        # that widths of 10 or more are not truncated to their last digit.
        if t.itemsize == 0:
            return '%s'
        nchar = t.itemsize // np.dtype('U1').itemsize
        return "%" + str(nchar) + "s"
    raise ValueError("No format specification string for dtype %s" % t)
def cformat2nptype(cfmt):
    r"""Convert a c format string to a numpy data type.

    Args:
        cfmt (bytes): c format that should be translated.

    Returns:
        str: Corresponding numpy data type (the ``str`` attribute of the
            matching numpy.dtype).

    Raises:
        TypeError: if cfmt is not of type backwards.bytes_type.
        ValueError: If the c format does not begin with '%'.
        ValueError: If the c format does not contain type info.
        ValueError: If the c format cannot be translated to a numpy datatype.

    """
    # TODO: this may fail on 32bit systems where C long types are 32 bit
    if not isinstance(cfmt, backwards.bytes_type):
        raise TypeError("Input must be of type %s, not %s" %
                        (backwards.bytes_type, type(cfmt)))
    elif not cfmt.startswith(_fmt_char):
        # The whole message is formatted at once so that '%%' renders as '%'.
        raise ValueError("Provided C format string (%s) does not start "
                         "with '%%'" % (cfmt,))
    elif len(cfmt) == 1:
        raise ValueError("Provided C format string (%s) " % (cfmt,) +
                         "does not contain type info")
    cfmt_str = backwards.bytes2unicode(cfmt)
    conv = cfmt_str[-1]  # Conversion character
    if conv in ('f', 'F', 'e', 'E', 'g', 'G'):
        out = 'float64'
    elif conv in ('d', 'i', 'u', 'o', 'x', 'X'):
        # Signed and unsigned conversions share the same length modifiers:
        # (char, short, long long, long, int)
        if conv in ('d', 'i'):
            names = ('int8', 'short', 'longlong', 'int_', 'intc')
        else:
            names = ('uint8', 'ushort', 'ulonglong', 'uint64', 'uintc')
        if 'hh' in cfmt_str:  # short short, single char
            out = names[0]
        elif cfmt_str[-2] == 'h':  # short
            out = names[1]
        elif 'll' in cfmt_str:  # long long
            out = names[2]
        elif cfmt_str[-2] == 'l':  # long (broken in python)
            out = names[3]
        else:  # int, platform dependent
            out = names[4]
    elif conv in ('c', 's'):
        # Anything between '%' and the conversion character is used as the
        # string length.
        out = backwards.np_dtype_str + cfmt_str[1:-1]
    else:
        raise ValueError("Could not find match for format str %s" % (cfmt,))
    return np.dtype(out).str
def cformat2pyscanf(cfmt):
    r"""Convert a c format specification string to a version that the
    python scanf module can use.

    Only the leading '%' and the final conversion character are kept.

    Args:
        cfmt (bytes): C format specification string.

    Returns:
        bytes: Version of cfmt that can be parsed by scanf.

    Raises:
        TypeError: if cfmt is not of type backwards.bytes_type.
        ValueError: If the c format does not begin with '%'.
        ValueError: If the c format does not contain type info.

    """
    if not isinstance(cfmt, backwards.bytes_type):
        raise TypeError("Input must be of type %s." % backwards.bytes_type)
    elif not cfmt.startswith(_fmt_char):
        # The whole message is formatted at once so that '%%' renders as '%'.
        raise ValueError("Provided C format string (%s) does not start "
                         "with '%%'" % (cfmt,))
    elif len(cfmt) == 1:
        raise ValueError("Provided C format string (%s) " % (cfmt,) +
                         "does not contain type info")
    # Hacky, but necessary to handle concatenation of a single byte
    cfmt_str = backwards.bytes2unicode(cfmt)
    out = backwards.bytes2unicode(_fmt_char) + cfmt_str[-1]
    # Short and long specifiers are stripped from the result
    out = out.replace('h', '').replace('l', '')
    return backwards.unicode2bytes(out)
class AsciiTable(AsciiFile):
    r"""ASCII table whose rows are described by a C-style format string
    and/or an equivalent numpy structured data type."""

    def __init__(self, filepath, io_mode, format_str=None, dtype=None,
                 column_names=None, use_astropy=False, **kwargs):
        r"""Class for reading/writing an ASCII table.

        Args:
            filepath (str): Full path to the file that should be read from
                or written to.
            io_mode (str): Mode that should be used to open the file. Valid
                values include 'r', 'w', and None. None can be used to
                indicate an in memory table that will not be read from or
                written to a file.
            format_str (str): Format string that should be used to format
                output in the case that the io_mode is 'w' (write). It is not
                required if the io_mode is any other value.
            dtype (str or numpy.dtype): Numpy structured data type for each
                row. If not provided it is set using format_str. Defaults to
                None.
            column_names (list, optional): List of column names. Defaults to
                None.
            use_astropy (bool, optional): If True, astropy is used to
                determine a table's format if it is installed. If False, a
                format string must be contained in the table. Defaults to
                False.
            comment (str, optional): String that should be used to identify
                comments. Defaults to '#'.
            newline (str, optional): String that should be used to identify
                the end of a line. Defaults to '\n'.
            column (str, optional): String that should be used to separate
                columns. Defaults to '\t'.

        Raises:
            RuntimeError: If format_str is not provided and the io_mode is
                'w' (write).

        """
        # astropy is only used if it was requested AND could be imported.
        self.use_astropy = _use_astropy if use_astropy else False
        super(AsciiTable, self).__init__(filepath, io_mode, **kwargs)
        self.column_names = None
        # Add default args specific to ascii table
        for k, v in _default_args.items():
            if not hasattr(self, k):
                setattr(self, k, v)
        self.column = backwards.unicode2bytes(self.column)
        try:
            self._format_str = backwards.unicode2bytes(format_str)
        except TypeError:
            # No usable format string: fall back on the data type, then on
            # discovering the format from the file when reading.
            if isinstance(dtype, (str, np.dtype)):
                self._dtype = np.dtype(dtype)
            elif io_mode == 'r':
                self.discover_format_str()
            else:
                raise RuntimeError("'format_str' must be provided for output")
        # Provided names are only accepted if there is one per column.
        if isinstance(column_names, list) and (len(column_names) == self.ncols):
            self.column_names = column_names

    @property
    def format_str(self):
        r"""Format string for a row, derived from the data type if it has
        not been set explicitly."""
        if not hasattr(self, '_format_str'):
            if hasattr(self, '_dtype'):
                fmts = [backwards.unicode2bytes(nptype2cformat(self.dtype[i]))
                        for i in range(len(self.dtype))]
                self._format_str = backwards.unicode2bytes(
                    self.column.join(fmts) + self.newline)
            else:  # pragma: debug
                raise RuntimeError("Format string not set " +
                                   "and cannot be determined.")
        return self._format_str

    @property
    def dtype(self):
        r"""Numpy structured data type for a row, derived from the format
        string if it has not been set explicitly. Fields are named 'f0',
        'f1', ..."""
        if not hasattr(self, '_dtype'):
            typs = [('f' + str(i), np.dtype(cformat2nptype(f)))
                    for i, f in enumerate(self.fmts)]
            self._dtype = np.dtype(typs)
        return self._dtype

    @property
    def fmts(self):
        r"""List of formats in format string."""
        return self.format_str.split(self.newline)[0].split(self.column)

    @property
    def ncols(self):
        r"""Number of columns, counted as the number of '%' characters in
        the format string."""
        return self.format_str.count(_fmt_char)

    def update_format_str(self, new_format_str):
        r"""Change the format string and update the data type.

        Args:
            new_format_str (str): New format string.

        """
        self._format_str = backwards.unicode2bytes(new_format_str)
        # Drop the cached data type so it is rebuilt from the new format.
        if hasattr(self, '_dtype'):
            delattr(self, '_dtype')

    def update_dtype(self, new_dtype):
        r"""Change the data type and update the format string.

        Args:
            new_dtype (str or np.dtype): New numpy data type.

        """
        if (not isinstance(new_dtype, np.dtype)) and isinstance(new_dtype, str):
            new_dtype = np.dtype(new_dtype)
        self._dtype = new_dtype
        # Drop the cached format string so it is rebuilt from the new type.
        if hasattr(self, '_format_str'):
            delattr(self, '_format_str')

    def writeheader(self, names=None):
        r"""Write header including column names and format.

        Args:
            names (list, optional): List of names of columns. Defaults to
                None and the ones provided at construction are used if they
                exist. Otherwise, no names are written.

        """
        self.writenames(names=names)
        self.writeformat()

    def writenames(self, names=None):
        r"""Write column names to file.

        Args:
            names (list, optional): List of names of columns. Defaults to
                None and the ones provided at construction are used if they
                exist. Otherwise, no names are written.

        Raises:
            IndexError: If there are not enough names for all of the columns.

        """
        if names is None:
            names = self.column_names
        if names is None:
            return
        if len(names) != self.ncols:
            raise IndexError("The number of names must match the number of columns.")
        names = [backwards.unicode2bytes(n) for n in names]
        line = (self.comment + backwards.unicode2bytes(' ') +
                self.column.join(names) + self.newline)
        self.writeline_full(line)

    def writeformat(self):
        r"""Write the format string to the file."""
        line = self.comment + backwards.unicode2bytes(' ') + self.format_str
        self.writeline_full(line)

    def readline(self):
        r"""Continue reading lines until a valid line (uncommented) is
        encountered and return the arguments found there.

        Returns:
            tuple (bool, tuple): End of file flag and the arguments that
                were read from the line. If the end of file is reached, None
                is returned.

        """
        eof, line = False, None
        while (not eof) and (line is None):
            eof, line = self.readline_full(validate=True)
        if (not line) or eof:
            args = None
        else:
            args = self.process_line(line)
        return eof, args

    def writeline(self, *args):
        r"""Write arguments to a file in the table format.

        Args:
            \*args: Any number of arguments that should be written to the
                file.

        """
        if self.is_open:
            line = self.format_line(*args)
        else:
            line = backwards.unicode2bytes('')
        self.writeline_full(line, validate=True)

    def readline_full(self, validate=False):
        r"""Read a line and return it if it is not a comment.

        Args:
            validate (bool, optional): If True, the line is checked to see if
                it matches the expected table format. Defaults to False.

        Returns:
            tuple (bool, str): End of file flag and the line that was read
                (an empty string if the end of file was encountered). If the
                line is a comment, None is returned.

        """
        eof, line = super(AsciiTable, self).readline_full()
        if self.is_open and (not eof) and (line is not None) and validate:
            self.validate_line(line)
        return eof, line

    def writeline_full(self, line, validate=False):
        r"""Write a line to the file in its present state.

        Args:
            line (str): Line to be written.
            validate (bool, optional): If True, the line is checked to see if
                it matches the expected table format. Defaults to False.

        """
        # NOTE(review): validate_line requires backwards.bytes_type, so on
        # Python 3 a bytes line is not an instance of str and is presumably
        # never validated here -- confirm whether that is intended.
        if self.is_open and isinstance(line, str) and validate:
            self.validate_line(line)
        super(AsciiTable, self).writeline_full(line)

    def format_line(self, *args):
        r"""Create a line from the provided arguments using the table format.

        Args:
            \*args: Arguments to create line from.

        Returns:
            str: The line created from the arguments.

        Raises:
            RuntimeError: If too few arguments are passed.

        """
        # NOTE(review): only too few arguments are rejected here; too many
        # are left to the '%' formatting below.
        if len(args) < self.ncols:
            raise RuntimeError("Incorrect number of arguments.")
        out = backwards.bytes2unicode(self.format_str) % args
        return backwards.unicode2bytes(out)

    def process_line(self, line):
        r"""Extract values from the columns in the line using the table
        format.

        Args:
            line (str): String to extract arguments from.

        Returns:
            tuple: The arguments extracted from line.

        """
        # Build a scanf-compatible version of the format string.
        scanf_fmts = [backwards.unicode2bytes(cformat2pyscanf(f))
                      for f in self.fmts]
        new_fmt = self.column.join(scanf_fmts) + self.newline
        return scanf(backwards.bytes2unicode(new_fmt),
                     backwards.bytes2unicode(line))

    def validate_line(self, line):
        r"""Assert that the line matches the format string and produces the
        expected number of values.

        Raises:
            TypeError: If the line is not a bytes/str.
            AssertionError: If the line does not match the format string.

        """
        if not isinstance(line, backwards.bytes_type):
            raise TypeError("Line must be of type %s, not %s." %
                            (backwards.bytes_type, type(line)))
        args = self.process_line(line)
        if args is None or (len(args) != self.ncols):
            raise AssertionError("The line does not match the format string.")

    def discover_format_str(self):
        r"""Determine the format string by reading it from the file. The
        format string is assumed to start with a comment and contain C-style
        format codes (e.g. '%f').

        Raises:
            RuntimeError: If a format string cannot be located within the
                file.

        """
        if self.use_astropy:
            tab = apy_ascii.read(self.filepath,
                                 **getattr(self, 'astropy_kwargs', {}))
            self._arr = tab.as_array()
            self._dtype = self._arr.dtype
            if getattr(self, 'column_names', None) is None:
                self.column_names = list(tab.columns)
            return
        space = backwards.unicode2bytes(' ')
        comment_list = []
        out = None
        # The format line is the first comment in which every column starts
        # with '%'. Comments seen before it are kept as candidate name lines.
        with open(self.filepath, 'rb') as fd:
            for line in fd:
                if not line.startswith(self.comment):
                    continue
                sline = line.lstrip(self.comment).lstrip(space)
                fmts = sline.split(self.column)
                if all(f.startswith(_fmt_char) for f in fmts):
                    out = sline
                    break
                comment_list.append(sline)
        if out is None:  # pragma: debug
            raise RuntimeError("Could not locate a line containing format descriptors.")
        self._format_str = backwards.unicode2bytes(out)
        # Do column names
        if getattr(self, 'column_names', None) is None:
            self.column_names = None
            for sline in comment_list:
                names = sline.split(self.newline)[0].split(self.column)
                if len(names) == self.ncols:
                    self.column_names = [
                        backwards.bytes2unicode(n) for n in names]
                    break
        # Do string lengths: replace each bare '%s' with the widest entry
        # found in that column of the file.
        str_fmt = backwards.unicode2bytes('%s')
        if str_fmt in self._format_str:
            fmts = self.fmts
            idx_str = [i for i, ifmt in enumerate(fmts) if ifmt == str_fmt]
            max_len = {i: 0 for i in idx_str}
            with open(self.filepath, 'rb') as fd:
                for line in fd:
                    if line.startswith(self.comment):
                        continue
                    cols = line.split(self.newline)[0].split(self.column)
                    for i in idx_str:
                        # Blank or short lines do not have every column.
                        if i < len(cols):
                            max_len[i] = max(max_len[i], len(cols[i]))
            for i in idx_str:
                fmts[i] = backwards.unicode2bytes('%' + str(max_len[i]) + 's')
            new_format_str = self.column.join(fmts) + self.newline
            self.update_format_str(new_format_str)

    @property
    def arr(self):
        r"""Numpy array of table contents if opened in read mode."""
        if self.io_mode == 'w':
            return None
        if not hasattr(self, '_arr'):
            self._arr = self.read_array()
        return self._arr

    def read_array(self, names=None):
        r"""Read the table in as an array.

        Args:
            names (list, optional): List of column names to label columns. If
                not provided, existing names are used if they exist. Defaults
                to None.

        Returns:
            np.ndarray: Array of table contents.

        Raises:
            ValueError: If names are provided, but not the same number as
                there are columns.

        """
        if names is None:
            names = self.column_names
        if (names is not None) and (len(names) != self.ncols):
            raise ValueError("The number of names does not match the number of columns")
        # Reuse the array if it was already loaded.
        if hasattr(self, '_arr'):
            return self._arr
        if self.use_astropy:
            arr = apy_ascii.read(self.filepath, names=names).as_array()
        else:
            with open(self.filepath, 'r') as fd:
                arr = np.genfromtxt(
                    fd,
                    comments=backwards.bytes2unicode(self.comment),
                    delimiter=backwards.bytes2unicode(self.column),
                    dtype=self.dtype, autostrip=True, names=names)
        return arr

    def write_array(self, array, names=None, skip_header=False):
        r"""Write a numpy array to the table.

        Args:
            array (np.ndarray): Array to be written.
            names (list, optional): List of column names to write out. If
                not provided, existing names are used if they exist. Defaults
                to None.
            skip_header (bool, optional): If True, no header information is
                written (it is assumed it was already written). Defaults to
                False.

        Raises:
            ValueError: If names are provided, but not the same number as
                there are columns.

        """
        fmt = backwards.bytes2unicode(self.format_str.split(self.newline)[0])
        column = backwards.bytes2unicode(self.column)
        comment = backwards.bytes2unicode(self.comment)
        newline = backwards.bytes2unicode(self.newline)
        if skip_header:
            names = None
        else:
            if names is None:
                names = self.column_names
            if (names is not None) and (len(names) != self.ncols):
                raise ValueError("The number of names does not match " +
                                 "the number of columns")
        if self.use_astropy:
            table = apy_Table(array)
            if skip_header:
                table_format = 'no_header'
            else:
                table_format = 'commented_header'
                # The format string is stored as a comment in the header.
                table.meta["comments"] = [fmt]
            apy_ascii.write(table, self.filepath, delimiter=column,
                            comment=comment + ' ', format=table_format,
                            names=names)
        else:
            if skip_header:
                head = ''
            else:
                head = fmt
                if names is not None:
                    head = column.join(names) + newline + " " + head
            np.savetxt(self.filepath, array, fmt=fmt, delimiter=column,
                       comments=comment + ' ', newline=newline, header=head)

    def array_to_bytes(self, arr=None, order='C'):
        r"""Convert arr to bytestring.

        Args:
            arr (np.ndarray, optional): Array to write to bytestring. If None
                the array of table data is used.
            order (str, optional): Order that array should be written to the
                bytestring. With 'F' the bytes of each column are written
                one column after another. Defaults to 'C'.

        Returns:
            str: Bytestring.

        Raises:
            TypeError: If the provided array is not a numpy array.
            ValueError: If the array is not the correct type.

        """
        if arr is None:
            arr = self.arr
        if not isinstance(arr, np.ndarray):
            raise TypeError("Provided array must be an array.")
        if arr.dtype != self.dtype:
            # A plain 2D array with one column per field is converted to the
            # table's structured data type.
            if (arr.ndim != 2) or (arr.shape[1] != len(self.dtype)):
                raise ValueError("Data types do not match.")
            arr1 = np.empty(arr.shape[0], dtype=self.dtype)
            for i, n in enumerate(self.dtype.names):
                arr1[n] = arr[:, i]
        else:
            arr1 = arr
        if order == 'F':
            out = backwards.unicode2bytes('')
            for n in arr1.dtype.names:
                out = out + arr1[n].tobytes()
        else:
            out = arr1.tobytes(order='C')
        return out

    def bytes_to_array(self, data, order='C'):
        r"""Process bytes according to the table format and return it as an
        array.

        Args:
            data (bytes): Byte string of table data.
            order (str, optional): Order of data for reshaping. With 'F' the
                bytes are read as one complete column after another.
                Defaults to 'C'.

        Returns:
            np.ndarray: Structured numpy array (one element per row)
                containing data from bytes.

        Raises:
            RuntimeError: If the length of data is not a multiple of the
                size of one row.

        """
        dtype = self.dtype
        if (len(data) % dtype.itemsize) != 0:
            raise RuntimeError("Data length (%d) must a multiple of the itemsize (%d)."
                               % (len(data), dtype.itemsize))
        nrows = len(data) // dtype.itemsize
        if order == 'F':
            arr = np.empty((nrows,), dtype=dtype)
            prev = 0
            for i in range(len(dtype)):
                idata = data[prev:(prev + (nrows * dtype[i].itemsize))]
                arr[dtype.names[i]] = np.frombuffer(idata, dtype=dtype[i])
                prev += len(idata)
        else:
            # The structured data type already yields one element per row, so
            # no reshaping is needed. frombuffer returns a read-only view of
            # the bytes; copy so the result is an independent, writable array.
            arr = np.frombuffer(data, dtype=dtype).copy()
        return arr

    def read_bytes(self, order='C'):
        r"""Read the table in as array and encode as bytes.

        Args:
            order (str, optional): Order that array should be written to the
                bytestring. Defaults to 'C'.

        Returns:
            bytes: Array as bytes.

        """
        arr = self.read_array()
        return self.array_to_bytes(arr, order=order)

    def write_bytes(self, data, order='C', names=None):
        r"""Interpret bytes as an array and write it to the table.

        Args:
            data (bytes): Bytes string to be interpreted as array and written
                to file.
            order (str, optional): Order of data for reshaping. Defaults to
                'C'.
            names (list, optional): List of column names to write out. If not
                provided, existing names are used if they exist. Defaults to
                None.

        Raises:
            ValueError: If names are provided, but not the same number as
                there are columns.

        """
        arr = self.bytes_to_array(data, order=order)
        self.write_array(arr, names=names)