Add code snippets

2024-08-28 17:22:21 +02:00
parent de2368c07e
commit 064a0f75cb
32 changed files with 2354 additions and 0 deletions
--- a/matlab/csvconverter/csv2catalog.m
+++ b/matlab/csvconverter/csv2catalog.m
@@ -0,0 +1,110 @@
+function [Catalog] = csv2catalog(csvFilePath, column_desc, idPrefix, sortCatalogByTime, useCsvHeader)
+% DESCRIPTION: Program to create the catalogue v2.0x in the
+%              Matlab format file from csv file.
+% INPUTS:
+% - csvFilePath :           path to text file with csv data. The separator between columns is defined by ','.
+%     Description of the most popular catalog fields can be found at https://docs.cyfronet.pl/display/ISDOC/Catalog+-+description
+% - column_desc :           structure array (one element per CSV column) containing information how the catalog
+%     should be created
+%    - include: whether the field should be included in the resulting catalog
+%    - nameInCsv: name of the column in CSV file
+%    - inputType: type of the column in CSV file (REAL/INTEGER/TEXT/DATE_TIME/DATE/DATE_DAY/DATE_MONTH/DATE_YEAR/TIME)
+%    - inputTimeFormat: input format for reading time (if 'inputType' is one of time options)
+%    - inputTimeGroup: input group for merging dates; the values and the time formats of all columns sharing
+%        the same group are joined with '-' and parsed as one value, stored in the first column of the group
+%    - nameInCatalog: name of the column that should be inserted into resulting catalog
+%    - description: description of the column in the resulting catalog
+%    - format: format (display format) of the column in the resulting catalog
+%    - unit: unit of the column in the resulting catalog
+%    - fieldType: type of the column in the resulting catalog (e.g. Magnitude, Energy)
+% - idPrefix :            prefix of the ID column if the IDs should be generated (if the catalog doesn't contain
+%     the id or the column is not included ('include' is set to false)); when empty, the CSV file name
+%     (without directory and extension) is used
+% - sortCatalogByTime :   if true, the resulting catalog is passed through sortByTime before it is returned
+% - useCsvHeader :        if true, the first row of the CSV file is treated as a header (it is checked against
+%     the 'nameInCsv' values and skipped when reading the data); if false, every row is read as data
+% OUTPUT:
+% - Catalog :             structure array with one element per catalog column and the fields:
+%     field, type, val, unit, description, fieldType
+
+  %TODO handle multiple time columns
+  %TODO the script fails if any of the rows has an empty value at the end (but empty quotes is ok)
+
+  data = readAndCheckHeaders(csvFilePath, column_desc, useCsvHeader);
+  colCount = length(column_desc);
+  % 'data' is a flat list of all CSV cells, row after row. With a header row present the modifier is 0, so the
+  % header is neither counted as a data row nor addressed by the index computation below.
+  noCsvHeaderModifier = 1;
+  if useCsvHeader
+    noCsvHeaderModifier = 0;
+  end
+  rowCount = length(data) / colCount - 1 + noCsvHeaderModifier;
+  k = 1;    % column number in the generated catalog
+
+  % No usable ID column in the CSV: generate IDs of the form <idPrefix>_<row number padded to 4 digits>
+  if ~contains_id(column_desc)
+    if isempty(idPrefix)
+      [~, idPrefix] = fileparts(csvFilePath);
+    end
+    Catalog(k).field = 'ID';
+    Catalog(k).type = 3;
+    for row = 1 : rowCount
+      Catalog(k).val(row, 1) = { strcat(idPrefix, '_', num2str(row,'%04.f')) };
+    end
+    Catalog(k).unit = [];
+    Catalog(k).description = 'Event ID';
+    Catalog(k).fieldType = [];
+    k = 2;
+  end
+
+  [columnsInTimeGroup, columnsNotToInclude] = getTimeGroups(column_desc);
+
+  for col = 1 : colCount
+    current_col = column_desc(col);
+    if ~current_col.include || ismember(col, columnsNotToInclude)
+      continue;
+    end
+
+    % Indexes of the other columns merged into this one (empty when the column is not in a time group).
+    % Looked up once per column instead of once per row.
+    extraTimeCols = [];
+    if ~isempty(current_col.inputTimeGroup)
+      timeGroupArray = columnsInTimeGroup(current_col.inputTimeGroup);
+      extraTimeCols = timeGroupArray(2 : end);
+      for g = 1 : length(extraTimeCols)
+        % single-quoted '-' keeps the result a char vector (a double-quoted "-" is a string object in MATLAB)
+        current_col.inputTimeFormat = [current_col.inputTimeFormat, '-', column_desc(extraTimeCols(g)).inputTimeFormat];
+      end
+    end
+
+    isYearOnly = strcmp(current_col.format, '5a');
+    isYearMonth = strcmp(current_col.format, '5b');
+
+    Catalog(k).field = current_col.nameInCatalog;
+    Catalog(k).type = current_col.format;
+    for row = 1 : rowCount
+      rowOffset = colCount * (row - noCsvHeaderModifier);
+      rawValue = data{rowOffset + col, 1};
+      % append the values of the remaining columns of the time group, in the same order as the formats above
+      for g = 1 : length(extraTimeCols)
+        rawValue = [rawValue, '-', data{rowOffset + extraTimeCols(g), 1}];
+      end
+
+      if isempty(rawValue)
+        if strcmp(current_col.nameInCatalog, 'ID')
+          error('ID of the event cannot be empty (row: %d)', row)
+        elseif isText(current_col)
+          Catalog(k).val(row, 1) = {''};
+        else
+          Catalog(k).val(row, 1) = NaN;
+        end
+      else
+        parsedValue = parseTextValue(rawValue, current_col.inputType, current_col.inputTimeFormat);
+        if isYearOnly
+          % format '5a': the parsed date is stored as text containing the year only
+          Catalog(k).val(row, 1) = { datestr(parsedValue, 'yyyy') };
+        elseif isYearMonth
+          % format '5b': the parsed date is stored as text containing year and month
+          Catalog(k).val(row, 1) = { datestr(parsedValue, 'yyyy-mm') };
+        else
+          Catalog(k).val(row, 1) = parsedValue;
+        end
+      end
+    end
+    Catalog(k).unit = current_col.unit;
+    Catalog(k).description = current_col.description;
+    Catalog(k).fieldType = current_col.fieldType;
+    k = k + 1;
+  end
+
+  if sortCatalogByTime
+    Catalog = sortByTime(Catalog);
+  end
+end
+
+function containsId = contains_id(column_desc)
+% Checks whether column_desc defines a catalog column named 'ID' that is marked to be included.
+% All column definitions are inspected, not only the first one, so an ID column placed at any position
+% in the CSV is recognized. Returns false when there is no such column (or column_desc is empty).
+  idIdxs = find(strcmp({column_desc.nameInCatalog}, 'ID'));
+  if isempty(idIdxs)
+    containsId = false;
+  else
+    containsId = any([column_desc(idIdxs).include]);
+  end
+end
--- a/matlab/csvconverter/csv2gdf.m
+++ b/matlab/csvconverter/csv2gdf.m
@@ -0,0 +1,88 @@
+% -----------------
+% Copyright © 2023 ACK Cyfronet AGH, Poland.
+% -----------------
+
+function [gdfFileName] = csv2gdf(csvFilePath, column_desc, description, useCsvHeader)
+% DESCRIPTION: Program to create GDF files (in Matlab format) from csv file. Performs a reverse action to the
+%    gdf2csv.m script.
+% INPUTS:
+% - csvFilePath :           path to text file with csv data. The separator between columns is defined by ','.
+%     Description of the most popular GDF file formats can be found at https://docs.cyfronet.pl/display/ISDOC/GDF
+% - column_desc :           structure containing information how the gdf should be constructed
+% - description :           description written into the file
+% - useCsvHeader :          if true, the first row of the CSV file is treated as a header (it is checked against
+%     the 'nameInCsv' values and skipped when reading the data); if false, every row is read as data
+% OUTPUT:
+% - gdfFileName :           name of the CSV file without directory and extension; the GDF content is saved
+%     to a file with this name and the '.mat' extension
+
+    data = readAndCheckHeaders(csvFilePath, column_desc, useCsvHeader);
+    colCount = length(column_desc);
+    % 'data' is a flat list of all CSV cells, row after row. With a header row present the modifier is 0, so the
+    % header is neither counted as a data row nor addressed by the index computation below.
+    noCsvHeaderModifier = 1;
+    if useCsvHeader
+      noCsvHeaderModifier = 0;
+    end
+    rowCount = length(data) / colCount - 1 + noCsvHeaderModifier;
+
+    % The variable names below are the names stored in the .mat file by the save call at the end
+    FormatName = 'GDF';
+    FormatVersion = 2.1;
+    CRS = 'n/a';
+    TimeZone = 'UTC';
+    Description = description;
+
+    FieldDescription = {};
+    FieldType = {};
+    FieldUnit = {};
+    d = struct();
+
+    [columnsInTimeGroup, columnsNotToInclude] = getTimeGroups(column_desc);
+    colInGdf = 1;    % column number in the generated gdf
+    for col = 1 : colCount
+        current_col = column_desc(col);
+        if ~current_col.include || ismember(col, columnsNotToInclude)
+            continue;
+        end
+
+        % Indexes of the other columns merged into this one (empty when the column is not in a time group).
+        % Looked up once per column instead of once per row.
+        extraTimeCols = [];
+        if ~isempty(current_col.inputTimeGroup)
+            timeGroupArray = columnsInTimeGroup(current_col.inputTimeGroup);
+            extraTimeCols = timeGroupArray(2 : end);
+            for g = 1 : length(extraTimeCols)
+                % single-quoted '-' keeps the result a char vector (a double-quoted "-" is a string object in MATLAB)
+                current_col.inputTimeFormat = [current_col.inputTimeFormat, '-', column_desc(extraTimeCols(g)).inputTimeFormat];
+            end
+        end
+
+        isYearOnly = strcmp(current_col.format, '5a');
+        isYearMonth = strcmp(current_col.format, '5b');
+
+        fieldName = current_col.nameInCatalog;
+        FieldDescription(colInGdf, 1) = fieldName;
+        FieldDescription(colInGdf, 2) = current_col.description;
+        FieldType(colInGdf, 1) = fieldName;
+        FieldType(colInGdf, 2) = current_col.format;
+        FieldUnit(colInGdf, 1) = fieldName;
+        FieldUnit(colInGdf, 2) = current_col.unit;
+        d.(fieldName) = [];
+        for row = 1 : rowCount
+            rowOffset = colCount * (row - noCsvHeaderModifier);
+            rawValue = data{rowOffset + col, 1};
+            % append the values of the remaining columns of the time group, in the same order as the formats above
+            for g = 1 : length(extraTimeCols)
+                rawValue = [rawValue, '-', data{rowOffset + extraTimeCols(g), 1}];
+            end
+
+            if isempty(rawValue)
+                if isText(current_col)
+                    d.(fieldName)(row) = {''};
+                else
+                    d.(fieldName)(row) = NaN;
+                end
+            else
+                parsedValue = parseTextValue(rawValue, current_col.inputType, current_col.inputTimeFormat);
+                if isYearOnly
+                    % format '5a': the parsed date is stored as text containing the year only
+                    d.(fieldName)(row) = { datestr(parsedValue, 'yyyy') };
+                elseif isYearMonth
+                    % format '5b': the parsed date is stored as text containing year and month
+                    d.(fieldName)(row) = { datestr(parsedValue, 'yyyy-mm') };
+                else
+                    d.(fieldName)(row) = parsedValue;
+                end
+            end
+        end
+        colInGdf = colInGdf + 1;
+    end
+
+    [~, gdfFileName, ~] = fileparts(csvFilePath);
+    save(strcat(gdfFileName, '.mat'), 'FormatName', 'FormatVersion', 'CRS', 'TimeZone', 'Description', ...
+            'FieldDescription', 'FieldType', 'FieldUnit', 'd', '-v7')
+end
--- a/matlab/csvconverter/getTimeGroups.m
+++ b/matlab/csvconverter/getTimeGroups.m
@@ -0,0 +1,23 @@
+% -----------------
+% Copyright © 2023 ACK Cyfronet AGH, Poland.
+% -----------------
+function [columnsInTimeGroup, columnsNotToInclude] = getTimeGroups(column_desc)
+% DESCRIPTION: Walks through column_desc and groups the column indexes by their inputTimeGroup value.
+% INPUTS:
+% - column_desc :           structure containing definition of the CSV columns and their mapping to the final object
+% OUTPUTS:
+% - columnsInTimeGroup :    containers.Map from the inputTimeGroup value to a column vector with the indexes of
+%     the columns belonging to that group, in the order in which they appear in column_desc
+% - columnsNotToInclude :   column vector with the indexes of all columns that are not the first one in their
+%     own time group
+  columnsInTimeGroup = containers.Map();
+  columnsNotToInclude = [];
+
+  for idx = 1 : length(column_desc)
+    groupName = column_desc(idx).inputTimeGroup;
+    if isempty(groupName)
+      % column does not belong to any time group
+      continue;
+    end
+
+    if isKey(columnsInTimeGroup, groupName)
+      % a further member of an already known group: remember it and mark it as merged into the first one
+      columnsInTimeGroup(groupName) = [columnsInTimeGroup(groupName); idx];
+      columnsNotToInclude = [columnsNotToInclude; idx];
+    else
+      % first column of a new group
+      columnsInTimeGroup(groupName) = idx;
+    end
+  end
+end
--- a/matlab/csvconverter/isText.m
+++ b/matlab/csvconverter/isText.m
@@ -0,0 +1,10 @@
+% -----------------
+% Copyright © 2023 ACK Cyfronet AGH, Poland.
+% -----------------
+
+function result = isText(col_desc)
+% DESCRIPTION: Function checking if given column is of text type. A column counts as text when its input
+%    type is 'TEXT' or when its format is '5a' or '5b' (the formats that csv2catalog and csv2gdf store as
+%    formatted date text).
+% INPUTS:
+% - col_desc :           structure containing information how the column should be constructed
+%                        (the fields 'inputType' and 'format' are read)
+    hasTextInput = strcmp(col_desc.inputType, 'TEXT');
+    hasYearFormat = strcmp(col_desc.format, '5a');
+    hasYearMonthFormat = strcmp(col_desc.format, '5b');
+    result = hasTextInput | hasYearFormat | hasYearMonthFormat;
+end
--- a/matlab/csvconverter/parseTextValue.m
+++ b/matlab/csvconverter/parseTextValue.m
@@ -0,0 +1,35 @@
+% -----------------
+% Copyright © 2023 ACK Cyfronet AGH, Poland.
+% -----------------
+
+function parsedValue = parseTextValue(rawValue, type, timeFormat)
+% DESCRIPTION: Program that parses and returns value read from the text (a cell from a CSV file).
+% INPUTS:
+% - rawValue :              value to parse (text)
+% - type :                  type of the value as defined by CsvColumnContentType.java; the types handled here
+%                           are 'REAL', 'INTEGER', 'TEXT' and 'DATE_TIME', any other type raises an error
+% - timeFormat :            if the rawValue contains time, this format is used to parse it
+% OUTPUT:
+% - parsedValue :           a number for 'REAL'/'INTEGER', a one-element cell with the text for 'TEXT',
+%                           the result of datenum for 'DATE_TIME'
+% ERRORS: raised when a number or a date cannot be parsed, or when the type is not handled.
+
+    switch type
+      case {'REAL', 'INTEGER'}
+        % str2double is used instead of str2num on purpose: str2num evaluates the text as code (so CSV content
+        % could execute arbitrary commands) and accepts multi-value input such as '1 2', which is not a
+        % valid single CSV number.
+        parsedValue = str2double(rawValue);
+        % str2double reports unparsable text as NaN; a literal NaN in the CSV is still accepted as a value
+        isLiteralNaN = ~isempty(regexpi(strtrim(rawValue), '^[+-]?nan$', 'once'));
+        if isnan(parsedValue) && ~isLiteralNaN
+          % we checked if the value is empty before parsing (and such value will not be parsed), so a failure
+          % here means that it was in a wrong format and could not be parsed
+          error('Cannot parse number input (type: %s): %s', type, rawValue);
+        end
+      case 'TEXT'
+        parsedValue = {rawValue};
+      case 'DATE_TIME'
+        try
+          parsedValue = datenum(rawValue, timeFormat);
+        catch
+          error('Invalid input time format specification or CSV content to parse (%s)', rawValue);
+        end
+      otherwise
+        error('Unexpected input column type %s', type);
+    end
+end
--- a/matlab/csvconverter/readAndCheckHeaders.m
+++ b/matlab/csvconverter/readAndCheckHeaders.m
@@ -0,0 +1,33 @@
+% -----------------
+% Copyright © 2023 ACK Cyfronet AGH, Poland.
+% -----------------
+
+function data = readAndCheckHeaders(csvFilePath, column_desc, doCheckHeaders)
+% DESCRIPTION: Program that reads content from the CSV file, checking if the content matches the headers defined
+%    in the column_desc structure. The returned value is a cell with all values from the csv file.
+% INPUTS:
+% - csvFilePath :           path to the CSV file
+% - column_desc :           structure containing definition of the CSV columns and their mapping to the final object
+% - doCheckHeaders :        if true, the first values of the file are verified against the 'nameInCsv' values
+%                           from column_desc; if false, the content is returned without any verification
+% ERRORS: raised when the file cannot be opened or read, or when the header verification fails.
+
+    fid = fopen(csvFilePath);
+    if fid < 0
+        error('Cannot open CSV file: %s', csvFilePath);
+    end
+    % make sure the file gets closed also when reading fails
+    try
+        scanned = textscan(fid, '%q', 'Delimiter', ',');
+    catch readError
+        fclose(fid);
+        rethrow(readError);
+    end
+    fclose(fid);
+    data = scanned{1};   % cell with all values from the csv file
+
+    if doCheckHeaders
+        check_headers(data, column_desc);
+    end
+end
+
+function check_headers(data, column_desc)
+% Verifies that the first values read from the CSV file are the column names expected by column_desc
+% ('nameInCsv', in the same order) and that the total number of values is a multiple of the number of columns.
+% Raises an error when any of these conditions is not met.
+  colCount = length(column_desc);
+  % without this guard a file shorter than the header fails with an unhelpful index-out-of-bounds error
+  if length(data) < colCount
+    error('CSV file contains fewer values (%d) than the expected number of columns (%d)', length(data), colCount);
+  end
+
+  headers = data(1:colCount);
+  for i=1:colCount
+    if ~strcmp(column_desc(i).nameInCsv, headers(i))
+      error('Expected column %s, but found %s in CSV headers', column_desc(i).nameInCsv, char(headers(i)));
+    end
+  end
+
+  if mod(length(data), colCount) ~= 0
+    error('Improper number of values in one of the rows');
+  end
+
+end