Module create.table in plugin tabular v0.5.3
Create a table from supported source input types.
| | |
|---|---|
| Author(s) | Markus Binsteiner (markus@frkl.io) |
| Tags | tabular |
| Python class | kiara_plugin.tabular.modules.table.CreateTableModule |
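For orientation, a minimal usage sketch: kiara instantiates "create from" modules such as this one into concrete operations, usually named following the `create.<target>.from.<source>` pattern. The operation name, input/output field names and API calls below are assumptions based on that convention, not something documented on this page; verify against your installed kiara version (e.g. with `kiara operation list`).

```python
# Hedged sketch: run the (assumed) "create.table.from.file" operation via the
# Python API. Operation name, field names and the exact API surface may differ
# between kiara versions.
from kiara.api import KiaraAPI

kiara = KiaraAPI.instance()

results = kiara.run_job(
    "create.table.from.file",
    inputs={"file": "/path/to/data.csv"},  # placeholder path
)
table = results["table"].data  # expected: a KiaraTable wrapping a pyarrow.Table
```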
Module configuration options
Configuration class: kiara_plugin.tabular.modules.table.CreateTableModuleConfig
| Name | Description | Type | Required? | Default |
|---|---|---|---|---|
| source_type | The value type of the source value. | string | true | null |
| target_type | The value type of the target. | string | true | null |
| constants | Value constants for this module. | object | false | null |
| defaults | Value defaults for this module. | object | false | null |
| ignore_errors | Whether to ignore conversion errors and omit the failed items. | boolean | false | false |
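For illustration, this configuration is supplied when the module is instantiated, for example as the `module_config` of a pipeline step. The surrounding step structure below is a sketch of kiara's pipeline conventions; only the keys inside `module_config` come from the table above.

```python
# Sketch of a module configuration for "create.table"; the keys mirror the table above.
step_config = {
    "module_type": "create.table",
    "module_config": {
        "source_type": "file",   # required: value type of the source value
        "target_type": "table",  # required: value type of the target
        "ignore_errors": True,   # optional: skip items that fail to convert
    },
}
```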
Module source code
class CreateTableModule(CreateFromModule):
    """Create a table from supported source input types."""

    _module_type_name = "create.table"
    _config_cls = CreateTableModuleConfig

    def create_optional_inputs(
        self, source_type: str, target_type
    ) -> Union[Mapping[str, Mapping[str, Any]], None]:

        if source_type == "file":
            return {
                "first_row_is_header": {
                    "type": "boolean",
                    "optional": True,
                    "doc": "Whether the first row of a (csv) file is a header row. If not provided, kiara will try to auto-determine. Ignored if not a csv file.",
                }
            }

        return None

    def create__table__from__file(
        self, source_value: Value, optional: ValueMap
    ) -> Any:
        """Create a table from a file, trying to auto-determine the format of said file.

        Currently supported input file types:

        - csv
        - parquet
        """

        input_file: KiaraFile = source_value.data

        if input_file.file_name.endswith(".csv"):
            return self.import_csv_file(source_value, optional)
        elif input_file.file_name.endswith(".parquet"):
            return self.import_parquet_file(source_value, optional)

    def import_parquet_file(
        self, source_value: Value, optional: ValueMap
    ) -> KiaraTable:
        """Create a table from a parquet file value."""

        import pyarrow.parquet as pq

        # TODO: use memory mapping to optimize memory usage?
        input_file: KiaraFile = source_value.data
        imported_data = None
        errors = []

        try:
            imported_data = pq.read_table(input_file.path)
        except Exception as e:
            errors.append(e)

        if imported_data is None:
            raise KiaraProcessingException(
                f"Failed to import parquet file '{input_file.path}'."
            )

        return KiaraTable.create_table(imported_data)

    def import_csv_file(self, source_value: Value, optional: ValueMap) -> KiaraTable:
        """Create a table from a csv file value."""

        import csv as py_csv

        from pyarrow import csv

        input_file: KiaraFile = source_value.data
        imported_data = None
        errors = []

        has_header = optional.get_value_data("first_row_is_header")
        if has_header is None:
            try:
                has_header = True
                with open(input_file.path, "rt") as csvfile:
                    sniffer = py_csv.Sniffer()
                    has_header = sniffer.has_header(csvfile.read(2048))
                    csvfile.seek(0)
            except Exception as e:
                # TODO: add this to the process log
                log_message(
                    "csv_sniffer.error",
                    file=input_file.path,
                    error=str(e),
                    details="assuming csv file has header",
                )

        try:
            if has_header:
                imported_data = csv.read_csv(input_file.path)
            else:
                read_options = csv.ReadOptions(autogenerate_column_names=True)
                imported_data = csv.read_csv(
                    input_file.path, read_options=read_options
                )
        except Exception as e:
            errors.append(e)

        if imported_data is None:
            raise KiaraProcessingException(
                f"Failed to import csv file '{input_file.path}'."
            )

        # import pandas as pd
        # df = pd.read_csv(input_file.path)
        # imported_data = pa.Table.from_pandas(df)

        return KiaraTable.create_table(imported_data)

    # def create__table__from__csv_file(self, source_value: Value) -> Any:
    #     """Create a table from a csv_file value."""
    #
    #     from pyarrow import csv
    #
    #     input_file: FileModel = source_value.data
    #     imported_data = csv.read_csv(input_file.path)
    #
    #     # import pandas as pd
    #     # df = pd.read_csv(input_file.path)
    #     # imported_data = pa.Table.from_pandas(df)
    #
    #     return KiaraTable.create_table(imported_data)

    def create__table__from__file_bundle(self, source_value: Value) -> Any:
        """Create a table value from a text file_bundle.

        The resulting table will have (at a minimum) the following columns:

        - id: an auto-assigned index
        - rel_path: the relative path of the file (from the provided base path)
        - content: the text file content
        """

        import pyarrow as pa

        bundle: KiaraFileBundle = source_value.data

        columns = FILE_BUNDLE_IMPORT_AVAILABLE_COLUMNS

        ignore_errors = self.get_config_value("ignore_errors")
        file_dict = bundle.read_text_file_contents(ignore_errors=ignore_errors)

        # TODO: use chunks to save on memory
        tabular: Dict[str, List[Any]] = {}
        for column in columns:
            for index, rel_path in enumerate(sorted(file_dict.keys())):

                if column == "content":
                    _value: Any = file_dict[rel_path]
                elif column == "id":
                    _value = index
                elif column == "rel_path":
                    _value = rel_path
                else:
                    file_model = bundle.included_files[rel_path]
                    _value = getattr(file_model, column)

                tabular.setdefault(column, []).append(_value)

        table = pa.Table.from_pydict(tabular)
        return KiaraTable.create_table(table)
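Stripped of the kiara value handling, the csv branch above boils down to sniffing for a header row with Python's `csv.Sniffer` and then reading the file with `pyarrow.csv`. A self-contained sketch of that logic (the file path is a placeholder):

```python
import csv as py_csv

from pyarrow import csv

path = "data.csv"  # placeholder path to a local csv file

# sample the start of the file to guess whether the first row is a header
with open(path, "rt") as f:
    has_header = py_csv.Sniffer().has_header(f.read(2048))

if has_header:
    table = csv.read_csv(path)
else:
    # no header row: let pyarrow generate column names (f0, f1, ...)
    read_options = csv.ReadOptions(autogenerate_column_names=True)
    table = csv.read_csv(path, read_options=read_options)

print(table.schema)
```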