Added scripts converting mseeds from Bogdanka to SeisBench format, extended README, modified logging

2023-09-26 10:50:46 +02:00
parent 78ac51478c
commit aa39980573
15 changed files with 1788 additions and 66 deletions

File diff suppressed because one or more lines are too long


@@ -88,8 +88,8 @@
"</div>"
],
"text/plain": [
" Datetime X Y Depth Mw \n",
"0 2020-01-01 10:09:42.200 5.582503e+06 5.702646e+06 0.7 2.469231 \\\n",
" Datetime X Y Depth Mw \\\n",
"0 2020-01-01 10:09:42.200 5.582503e+06 5.702646e+06 0.7 2.469231 \n",
"\n",
" Phases mseed_name \n",
"0 Pg BRDW 2020-01-01 10:09:44.400, Sg BRDW 2020-... 20200101100941.mseed "
@@ -101,7 +101,7 @@
}
],
"source": [
"input_path = str(Path.cwd().parent) + \"/data/igf/\"\n",
"input_path = str(Path.cwd().parent) + \"/datasets/igf/\"\n",
"catalog = pd.read_excel(input_path + \"Catalog_20_21.xlsx\", index_col=0)\n",
"catalog.head(1)"
]
@@ -317,7 +317,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Traces converted: 35784it [00:52, 679.39it/s]\n"
"Traces converted: 35784it [01:01, 578.58it/s]\n"
]
}
],
@@ -339,8 +339,10 @@
" continue\n",
" if os.path.exists(input_path + \"mseeds/mseeds_2020/\" + event.mseed_name):\n",
" mseed_path = input_path + \"mseeds/mseeds_2020/\" + event.mseed_name \n",
" else:\n",
" elif os.path.exists(input_path + \"mseeds/mseeds_2021/\" + event.mseed_name):\n",
" mseed_path = input_path + \"mseeds/mseeds_2021/\" + event.mseed_name \n",
" else: \n",
" continue\n",
" \n",
" \n",
" stream = get_mseed(mseed_path)\n",
@@ -374,6 +376,8 @@
" # trace_params[f\"trace_{pick.phase_hint}_status\"] = pick.evaluation_mode\n",
" \n",
" writer.add_trace({**event_params, **trace_params}, data)\n",
"\n",
" # break\n",
" \n",
" "
]
@@ -393,7 +397,25 @@
"metadata": {},
"outputs": [],
"source": [
"data = sbd.WaveformDataset(output_path, sampling_rate=100)"
"data = sbd.WaveformDataset(output_path, sampling_rate=100)\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "33c77509-7aab-4833-a372-16030941395d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Unnamed dataset - 35784 traces\n"
]
}
],
"source": [
"print(data)"
]
},
{
@@ -406,17 +428,17 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"id": "1753f65e-fe5d-4cfa-ab42-ae161ac4a253",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<matplotlib.lines.Line2D at 0x7f7ed04a8820>"
"<matplotlib.lines.Line2D at 0x14d6c12d0>"
]
},
"execution_count": 12,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
},
@@ -449,7 +471,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"id": "bf7dae75-c90b-44f8-a51d-44e8abaaa3c3",
"metadata": {},
"outputs": [
@@ -472,7 +494,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"id": "de82db24-d983-4592-a0eb-f96beecb2f69",
"metadata": {},
"outputs": [
@@ -622,29 +644,29 @@
"</div>"
],
"text/plain": [
" index source_origin_time source_latitude_deg source_longitude_deg \n",
"0 0 2020-01-01 10:09:42.200 5.702646e+06 5.582503e+06 \\\n",
" index source_origin_time source_latitude_deg source_longitude_deg \\\n",
"0 0 2020-01-01 10:09:42.200 5.702646e+06 5.582503e+06 \n",
"1 1 2020-01-01 10:09:42.200 5.702646e+06 5.582503e+06 \n",
"2 2 2020-01-01 10:09:42.200 5.702646e+06 5.582503e+06 \n",
"3 3 2020-01-01 10:09:42.200 5.702646e+06 5.582503e+06 \n",
"4 4 2020-01-01 10:09:42.200 5.702646e+06 5.582503e+06 \n",
"\n",
" source_depth_km source_magnitude split station_network_code station_code \n",
"0 0.7 2.469231 train PL BRDW \\\n",
" source_depth_km source_magnitude split station_network_code station_code \\\n",
"0 0.7 2.469231 train PL BRDW \n",
"1 0.7 2.469231 train PL BRDW \n",
"2 0.7 2.469231 train PL GROD \n",
"3 0.7 2.469231 train PL GROD \n",
"4 0.7 2.469231 train PL GUZI \n",
"\n",
" trace_channel trace_sampling_rate_hz trace_start_time \n",
"0 EHE 100.0 2020-01-01T10:09:36.480000Z \\\n",
" trace_channel trace_sampling_rate_hz trace_start_time \\\n",
"0 EHE 100.0 2020-01-01T10:09:36.480000Z \n",
"1 EHE 100.0 2020-01-01T10:09:36.480000Z \n",
"2 EHE 100.0 2020-01-01T10:09:36.480000Z \n",
"3 EHE 100.0 2020-01-01T10:09:36.480000Z \n",
"4 CNE 100.0 2020-01-01T10:09:36.476000Z \n",
"\n",
" trace_Pg_arrival_sample trace_name trace_Sg_arrival_sample \n",
"0 792.0 bucket0$0,:3,:2001 NaN \\\n",
" trace_Pg_arrival_sample trace_name trace_Sg_arrival_sample \\\n",
"0 792.0 bucket0$0,:3,:2001 NaN \n",
"1 NaN bucket0$1,:3,:2001 921.0 \n",
"2 872.0 bucket0$2,:3,:2001 NaN \n",
"3 NaN bucket0$3,:3,:2001 1017.0 \n",
@@ -658,7 +680,7 @@
"4 ZNE "
]
},
"execution_count": 14,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -700,7 +722,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
"version": "3.10.6"
}
},
"nbformat": 4,

utils/convert_data.sh Normal file

@@ -0,0 +1,19 @@
#!/bin/bash
#SBATCH --job-name=mseeds_to_seisbench
#SBATCH --time=1:00:00
#SBATCH --account=plgeposai22gpu-gpu
#SBATCH --partition=plgrid
#SBATCH --cpus-per-task=1
#SBATCH --ntasks-per-node=1
#SBATCH --mem=24gb
## activate conda environment
source /net/pr2/projects/plgrid/plggeposai/kmilian/mambaforge/bin/activate
conda activate epos-ai-train
input_path="/net/pr2/projects/plgrid/plggeposai/datasets/bogdanka"
catalog_path="/net/pr2/projects/plgrid/plggeposai/datasets/bogdanka/BOIS_all.xml"
output_path="/net/pr2/projects/plgrid/plggeposai/kmilian/platform-demo-scripts/datasets/bogdanka/seisbench_format"
python mseeds_to_seisbench.py --input_path $input_path --catalog_path $catalog_path --output_path $output_path
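
For the record, the job is submitted with `sbatch utils/convert_data.sh`. Below is a minimal sketch of the equivalent direct call without SLURM, using the same paths the script passes on the command line (it assumes `mseeds_to_seisbench.py` is importable from the working directory):

# Sketch only: mirrors what convert_data.sh does, minus the batch scheduler.
from mseeds_to_seisbench import convert_mseed_to_seisbench_format

convert_mseed_to_seisbench_format(
    input_path="/net/pr2/projects/plgrid/plggeposai/datasets/bogdanka",
    catalog_path="/net/pr2/projects/plgrid/plggeposai/datasets/bogdanka/BOIS_all.xml",
    output_path="/net/pr2/projects/plgrid/plggeposai/kmilian/platform-demo-scripts/datasets/bogdanka/seisbench_format",
)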

utils/mseeds_to_seisbench.py Normal file

@@ -0,0 +1,250 @@
import os
import pandas as pd
import glob
from pathlib import Path
import obspy
from obspy.core.event import read_events
import seisbench
import seisbench.data as sbd
import seisbench.util as sbu
import sys
import logging
import argparse
logging.basicConfig(filename="output.out",
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.DEBUG)
logger = logging.getLogger('converter')


def create_traces_catalog(directory, years):
    for year in years:
        year_directory = f"{directory}/{year}"
        # collect every waveform file for the given year
        files = glob.glob(f"{year_directory}/*")
        traces = []
        for i, f in enumerate(files):
            st = obspy.read(f)
            for tr in st.traces:
                trs = pd.Series({
                    'trace_id': tr.id,
                    'trace_st': tr.meta.starttime,
                    'trace_end': tr.meta.endtime,
                    'stream_fname': f
                })
                traces.append(trs)
        # one row per trace; append each year's rows to the catalog file
        traces_catalog = pd.DataFrame(traces)
        traces_catalog.to_csv("data/bogdanka/traces_catalog.csv", mode='a', index=False)


def split_events(events, input_path):
    logger.info("Splitting available events into train, dev and test sets ...")
    events_stats = pd.DataFrame()
    events_stats.index.name = "event"
    for i, event in enumerate(events):
        # check if mseed exists
        actual_picks = 0
        for pick in event.picks:
            trace_params = get_trace_params(pick)
            trace_path = get_trace_path(input_path, trace_params)
            if os.path.isfile(trace_path):
                actual_picks += 1
        events_stats.loc[i, "pick_count"] = actual_picks

    events_stats['pick_count_cumsum'] = events_stats.pick_count.cumsum()
    train_th = 0.7 * events_stats.pick_count_cumsum.values[-1]
    dev_th = 0.85 * events_stats.pick_count_cumsum.values[-1]
    events_stats['split'] = 'test'
    for i, event in events_stats.iterrows():
        if event['pick_count_cumsum'] < train_th:
            events_stats.loc[i, 'split'] = 'train'
        elif event['pick_count_cumsum'] < dev_th:
            events_stats.loc[i, 'split'] = 'dev'
        else:
            break
    return events_stats
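
# Illustrative example of the split (invented pick counts): with ten events of
# 10 picks each, pick_count_cumsum runs 10, 20, ..., 100, so train_th = 70 and
# dev_th = 85. Events with cumsum 10-60 are labelled 'train', 70-80 'dev', and
# the loop breaks at 90, leaving the remaining events 'test' - i.e. roughly a
# 70/15/15 split weighted by pick count rather than by event count.
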
def get_event_params(event):
    origin = event.preferred_origin()
    if origin is None:
        return {}
    # print(origin)
    mag = event.preferred_magnitude()

    source_id = str(event.resource_id)

    event_params = {
        "source_id": source_id,
        "source_origin_uncertainty_sec": origin.time_errors["uncertainty"],
        "source_latitude_deg": origin.latitude,
        "source_latitude_uncertainty_km": origin.latitude_errors["uncertainty"],
        "source_longitude_deg": origin.longitude,
        "source_longitude_uncertainty_km": origin.longitude_errors["uncertainty"],
        "source_depth_km": origin.depth / 1e3,
        "source_depth_uncertainty_km": origin.depth_errors["uncertainty"] / 1e3 if origin.depth_errors[
            "uncertainty"] is not None else None,
    }

    if mag is not None:
        event_params["source_magnitude"] = mag.mag
        event_params["source_magnitude_uncertainty"] = mag.mag_errors["uncertainty"]
        event_params["source_magnitude_type"] = mag.magnitude_type
        event_params["source_magnitude_author"] = mag.creation_info.agency_id if mag.creation_info is not None else None

    return event_params


def get_trace_params(pick):
    net = pick.waveform_id.network_code
    sta = pick.waveform_id.station_code

    trace_params = {
        "station_network_code": net,
        "station_code": sta,
        "trace_channel": pick.waveform_id.channel_code,
        "station_location_code": pick.waveform_id.location_code,
        "time": pick.time
    }

    return trace_params


def find_trace(pick_time, traces):
    for tr in traces:
        if pick_time > tr.stats.endtime:
            continue
        if pick_time >= tr.stats.starttime:
            # print(pick_time, " - selected trace: ", tr)
            return tr

    logger.warning(f"no matching trace for pick: {pick_time}")
    return None


def get_trace_path(input_path, trace_params):
    year = trace_params["time"].year
    day_of_year = pd.Timestamp(str(trace_params["time"])).day_of_year
    net = trace_params["station_network_code"]
    station = trace_params["station_code"]
    tr_channel = trace_params["trace_channel"]
    # SDS archives zero-pad the day of year to three digits, e.g. "001"
    path = f"{input_path}/{year}/{net}/{station}/{tr_channel}.D/{net}.{station}..{tr_channel}.D.{year}.{day_of_year:03d}"
    return path


def load_trace(input_path, trace_params):
    trace_path = get_trace_path(input_path, trace_params)
    trace = None

    if not os.path.isfile(trace_path):
        logger.warning(trace_path + " not found")
    else:
        stream = obspy.read(trace_path)
        if len(stream.traces) > 1:
            trace = find_trace(trace_params["time"], stream.traces)
        elif len(stream.traces) == 0:
            logger.warning(f"no data in: {trace_path}")
        else:
            trace = stream.traces[0]

    return trace


def load_stream(input_path, trace_params, time_before=60, time_after=60):
    trace_path = get_trace_path(input_path, trace_params)
    sampling_rate, stream = None, None
    pick_time = trace_params["time"]

    if not os.path.isfile(trace_path):
        logger.warning(trace_path + " not found")
    else:
        stream = obspy.read(trace_path)
        stream = stream.slice(pick_time - time_before, pick_time + time_after)
        if len(stream.traces) == 0:
            logger.warning(f"no data in: {trace_path}")
        else:
            sampling_rate = stream.traces[0].stats.sampling_rate

    return sampling_rate, stream


def convert_mseed_to_seisbench_format(input_path, catalog_path, output_path):
    """
    Convert mseed files to seisbench dataset format
    :param input_path: folder with mseed files
    :param catalog_path: path to events catalog in quakeml format
    :param output_path: folder to save seisbench dataset
    :return:
    """
    logger.info("Loading events catalog ...")
    events = read_events(catalog_path)
    events_stats = split_events(events, input_path)

    metadata_path = output_path + "/metadata.csv"
    waveforms_path = output_path + "/waveforms.hdf5"

    logger.debug("Catalog loaded, starting conversion ...")

    with sbd.WaveformDataWriter(metadata_path, waveforms_path) as writer:
        writer.data_format = {
            "dimension_order": "CW",
            "component_order": "ZNE",
        }
        for i, event in enumerate(events):
            logger.debug(f"Converting event {i}")
            event_params = get_event_params(event)
            event_params["split"] = events_stats.loc[i, "split"]

            for pick in event.picks:
                trace_params = get_trace_params(pick)
                sampling_rate, stream = load_stream(input_path, trace_params)
                # skip picks whose waveform file is missing or whose slice is empty
                if stream is None or len(stream) == 0:
                    continue

                actual_t_start, data, _ = sbu.stream_to_array(
                    stream,
                    component_order=writer.data_format["component_order"],
                )
                trace_params["trace_sampling_rate_hz"] = sampling_rate
                trace_params["trace_start_time"] = str(actual_t_start)

                pick_time = obspy.core.utcdatetime.UTCDateTime(trace_params["time"])
                pick_idx = (pick_time - actual_t_start) * sampling_rate
                trace_params[f"trace_{pick.phase_hint}_arrival_sample"] = int(pick_idx)

                writer.add_trace({**event_params, **trace_params}, data)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Convert mseed files to seisbench format')
    parser.add_argument('--input_path', type=str, help='Path to mseed files')
    parser.add_argument('--catalog_path', type=str, help='Path to events catalog in quakeml format')
    parser.add_argument('--output_path', type=str, help='Path to output files')
    args = parser.parse_args()

    convert_mseed_to_seisbench_format(args.input_path, args.catalog_path, args.output_path)
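
Once the converter finishes, metadata.csv and waveforms.hdf5 form a regular SeisBench dataset. A quick sanity check, mirroring the notebook cells above (a sketch; adjust the path to your output_path, and note that sampling_rate resamples traces on load):

import seisbench.data as sbd

data = sbd.WaveformDataset("datasets/bogdanka/seisbench_format", sampling_rate=100)
print(data)                        # prints "<dataset name> - <n> traces"
train = data.train()               # subset selected via the 'split' column written by split_events
waveforms = data.get_waveforms(0)  # numpy array shaped (components, samples), per "CW"/"ZNE"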

utils/utils.py Normal file

@@ -0,0 +1,230 @@
import os
import pandas as pd
import glob
from pathlib import Path
import obspy
from obspy.core.event import read_events
import seisbench.data as sbd
import seisbench.util as sbu
import sys
import logging
logging.basicConfig(filename="output.out",
                    filemode='a',
                    format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S',
                    level=logging.DEBUG)
logger = logging.getLogger('converter')


def create_traces_catalog(directory, years):
    for year in years:
        year_directory = f"{directory}/{year}"
        # collect every waveform file for the given year
        files = glob.glob(f"{year_directory}/*")
        traces = []
        for i, f in enumerate(files):
            st = obspy.read(f)
            for tr in st.traces:
                trs = pd.Series({
                    'trace_id': tr.id,
                    'trace_st': tr.meta.starttime,
                    'trace_end': tr.meta.endtime,
                    'stream_fname': f
                })
                traces.append(trs)
        # one row per trace; append each year's rows to the catalog file
        traces_catalog = pd.DataFrame(traces)
        traces_catalog.to_csv("data/bogdanka/traces_catalog.csv", mode='a', index=False)


def split_events(events, input_path):
    logger.info("Splitting available events into train, dev and test sets ...")
    events_stats = pd.DataFrame()
    events_stats.index.name = "event"
    for i, event in enumerate(events):
        # check if mseed exists
        actual_picks = 0
        for pick in event.picks:
            trace_params = get_trace_params(pick)
            trace_path = get_trace_path(input_path, trace_params)
            if os.path.isfile(trace_path):
                actual_picks += 1
        events_stats.loc[i, "pick_count"] = actual_picks

    events_stats['pick_count_cumsum'] = events_stats.pick_count.cumsum()
    train_th = 0.7 * events_stats.pick_count_cumsum.values[-1]
    dev_th = 0.85 * events_stats.pick_count_cumsum.values[-1]
    events_stats['split'] = 'test'
    for i, event in events_stats.iterrows():
        if event['pick_count_cumsum'] < train_th:
            events_stats.loc[i, 'split'] = 'train'
        elif event['pick_count_cumsum'] < dev_th:
            events_stats.loc[i, 'split'] = 'dev'
        else:
            break
    return events_stats


def get_event_params(event):
    origin = event.preferred_origin()
    if origin is None:
        return {}
    # print(origin)
    mag = event.preferred_magnitude()

    source_id = str(event.resource_id)

    event_params = {
        "source_id": source_id,
        "source_origin_uncertainty_sec": origin.time_errors["uncertainty"],
        "source_latitude_deg": origin.latitude,
        "source_latitude_uncertainty_km": origin.latitude_errors["uncertainty"],
        "source_longitude_deg": origin.longitude,
        "source_longitude_uncertainty_km": origin.longitude_errors["uncertainty"],
        "source_depth_km": origin.depth / 1e3,
        "source_depth_uncertainty_km": origin.depth_errors["uncertainty"] / 1e3 if origin.depth_errors[
            "uncertainty"] is not None else None,
    }

    if mag is not None:
        event_params["source_magnitude"] = mag.mag
        event_params["source_magnitude_uncertainty"] = mag.mag_errors["uncertainty"]
        event_params["source_magnitude_type"] = mag.magnitude_type
        event_params["source_magnitude_author"] = mag.creation_info.agency_id if mag.creation_info is not None else None

    return event_params


def get_trace_params(pick):
    net = pick.waveform_id.network_code
    sta = pick.waveform_id.station_code

    trace_params = {
        "station_network_code": net,
        "station_code": sta,
        "trace_channel": pick.waveform_id.channel_code,
        "station_location_code": pick.waveform_id.location_code,
        "time": pick.time
    }

    return trace_params


def find_trace(pick_time, traces):
    for tr in traces:
        if pick_time > tr.stats.endtime:
            continue
        if pick_time >= tr.stats.starttime:
            # print(pick_time, " - selected trace: ", tr)
            return tr

    logger.warning(f"no matching trace for pick: {pick_time}")
    return None


def get_trace_path(input_path, trace_params):
    year = trace_params["time"].year
    day_of_year = pd.Timestamp(str(trace_params["time"])).day_of_year
    net = trace_params["station_network_code"]
    station = trace_params["station_code"]
    tr_channel = trace_params["trace_channel"]
    # SDS archives zero-pad the day of year to three digits, e.g. "001"
    path = f"{input_path}/{year}/{net}/{station}/{tr_channel}.D/{net}.{station}..{tr_channel}.D.{year}.{day_of_year:03d}"
    return path


def load_trace(input_path, trace_params):
    trace_path = get_trace_path(input_path, trace_params)
    trace = None

    if not os.path.isfile(trace_path):
        logger.warning(trace_path + " not found")
    else:
        stream = obspy.read(trace_path)
        if len(stream.traces) > 1:
            trace = find_trace(trace_params["time"], stream.traces)
        elif len(stream.traces) == 0:
            logger.warning(f"no data in: {trace_path}")
        else:
            trace = stream.traces[0]

    return trace


def load_stream(input_path, trace_params, time_before=60, time_after=60):
    trace_path = get_trace_path(input_path, trace_params)
    sampling_rate, stream = None, None
    pick_time = trace_params["time"]

    if not os.path.isfile(trace_path):
        logger.warning(trace_path + " not found")
    else:
        stream = obspy.read(trace_path)
        stream = stream.slice(pick_time - time_before, pick_time + time_after)
        if len(stream.traces) == 0:
            logger.warning(f"no data in: {trace_path}")
        else:
            sampling_rate = stream.traces[0].stats.sampling_rate

    return sampling_rate, stream


def convert_mseed_to_seisbench_format():
    input_path = "/net/pr2/projects/plgrid/plggeposai"
    logger.info("Loading events catalog ...")
    events = read_events(input_path + "/BOIS_all.xml")
    events_stats = split_events(events, input_path)

    output_path = input_path + "/seisbench_format"
    metadata_path = output_path + "/metadata.csv"
    waveforms_path = output_path + "/waveforms.hdf5"

    with sbd.WaveformDataWriter(metadata_path, waveforms_path) as writer:
        writer.data_format = {
            "dimension_order": "CW",
            "component_order": "ZNE",
        }
        for i, event in enumerate(events):
            logger.debug(f"Converting event {i}")
            event_params = get_event_params(event)
            event_params["split"] = events_stats.loc[i, "split"]

            for pick in event.picks:
                trace_params = get_trace_params(pick)
                sampling_rate, stream = load_stream(input_path, trace_params)
                # skip picks whose waveform file is missing or whose slice is empty
                if stream is None or len(stream) == 0:
                    continue

                actual_t_start, data, _ = sbu.stream_to_array(
                    stream,
                    component_order=writer.data_format["component_order"],
                )
                trace_params["trace_sampling_rate_hz"] = sampling_rate
                trace_params["trace_start_time"] = str(actual_t_start)

                pick_time = obspy.core.utcdatetime.UTCDateTime(trace_params["time"])
                pick_idx = (pick_time - actual_t_start) * sampling_rate
                trace_params[f"trace_{pick.phase_hint}_arrival_sample"] = int(pick_idx)

                writer.add_trace({**event_params, **trace_params}, data)


if __name__ == "__main__":
    convert_mseed_to_seisbench_format()
    # create_traces_catalog("/net/pr2/projects/plgrid/plggeposai/", ["2018", "2019"])
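
For orientation, get_trace_path expects an SDS-style archive layout under input_path. A sketch with hypothetical pick values (invented for illustration; assumes utils.py is importable as utils):

import obspy
from utils import get_trace_path

# Hypothetical pick metadata; only network, station, channel and time are used.
trace_params = {
    "station_network_code": "PL",
    "station_code": "BRDW",
    "trace_channel": "EHE",
    "station_location_code": "",
    "time": obspy.UTCDateTime("2020-01-01T10:09:44"),
}
print(get_trace_path("/net/pr2/projects/plgrid/plggeposai", trace_params))
# /net/pr2/projects/plgrid/plggeposai/2020/PL/BRDW/EHE.D/PL.BRDW..EHE.D.2020.001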