#!/usr/bin/env python
"""
Script name: utilities.py
Description:
File containing utilities
"""
try:
import cv2
except:
pass
import json
import numpy as np
import pandas as pd
import os
import matplotlib
from sklearn import mixture
from sklearn.decomposition import PCA
import h5py
import time
import random as rd
import xml.etree.ElementTree as ET # for XML file
import sys
from math import floor
from matplotlib.gridspec import GridSpec, GridSpecFromSubplotSpec
import concurrent.futures
import itertools as itrtls
from math import factorial
from scipy.stats import ttest_ind, binom, binom_test, chi2, beta
from sklearn.utils import shuffle as sklearn_shuffle
import scipy.integrate as integrate
try:
from gwpy.timeseries import TimeSeries, TimeSeriesDict
from gwpy.segments import DataQualityFlag
from gwpy.table import EventTable
from gwpy.table.io.pycbc import filter_empty_files as filter_pycbc_live_files
from gwpy.time import tconvert
except ImportError:
print('There is not gwpy module yet')
try:
from gwtrigfind import find_trigger_files
except ModuleNotFoundError:
raise ValueError("gwtrigfind has not been installed yet. please do pip install gwtrigfind")
matplotlib.use("Agg")
# matplotlib.rcParams["backend"] = "TkAgg"
# Test display
# try:
# matplotlib.rcParams["backend"] = "TkAgg"
# m_dis = os.environ["DISPLAY"]
# except KeyError:
# print("Display problems. Set to Agg")
# matplotlib.use('Agg')
#
import matplotlib.pyplot as plt
plt.rc('text', usetex=False) # this is used for fixing a gwpy compatible issue
from matplotlib.colors import LogNorm
from matplotlib.ticker import MaxNLocator
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
import origli.utilities.const as Const # for multi frequency bands defined in
__author__ = "Kentaro Mogushi"
__copyright__ = "Copyright 2019-2025, Kentaro Mogushi"
__credits__ = ["Line for credits"]
__license__ = "GPL"
__version__ = "0.1.0"
__maintainer__ = "K. Mogushi"
__email__ = "kmhhz@umsystem.edu"
__status__ = "Developing"
# def convert_imageset_to_pandas(m_indexfile):
# """
#
# :param m_indexfile:
# :return:
# """
#
# df_in = pd.read_csv(m_indexfile)
# label_list=[]
# image_list=None
#
# for ii in range(len(df_in)):
# m_img_array,h,w,d = image_to_1d(df_in.iloc[ii]["id"])
#
# m_img_array = np.insert(m_img_array,0,d,axis=1)
# m_img_array = np.insert(m_img_array,0,w,axis=1)
# m_img_array = np.insert(m_img_array,0,h,axis=1)
#
# try:
# image_list = np.concatenate((image_list,m_img_array),axis=0)
# except ValueError:
# image_list = m_img_array
# label_list.append(df_in.loc[ii]["label"])
#
#
# label_list = np.asarray(label_list)
#
# #build the labeling
# col_names =["h","w","d"]
# for hi in range(h):
# for wi in range(w):
# for di in range(d):
# col_names.append("px-h%s-w%s-d%s" % (hi,wi,di))
#
#
# image_df = pd.DataFrame(data=image_list,columns=col_names)
# label_df = pd.DataFrame(data=label_list,columns=["label"])
#
# out_df = label_df.join(image_df)
#
# if len(out_df)!=len(df_in):
# raise RuntimeError("Input and output numer of images does not match!")
#
# #if m_output_name is not None:
# # out_df.to_csv(m_output_name)#,columns=["label",col_names])
# # out_df.to_hdf(m_output_name,key="df", mode='w', complevel=9, complib="bzip2")
# return out_df
#
# def image_to_1d(m_input_image_filename):
# """
#
# :param m_input_image:
# :return:
# """
#
# m_input_image = cv2.imread(m_input_image_filename)
#
# h = m_input_image.shape[0]
# w = m_input_image.shape[1]
# d = m_input_image.shape[2]
# cv2.destroyAllWindows()
#
# return m_input_image.reshape(1,h*w*d),h,w,d
#
# def read_json_file(m_filename):
# m_out_dict = json.loads(open(m_filename).read())
#
# return m_out_dict
#
#
#
###################################################
# grab the GPS times of a glitch for a certain ifo
###################################################
def GrabGPStimesSafechannel(fileid, ifo, SNRthre, glitch, PathSafeChannel, Epochstart, Epochend, Commissioning_lt=None):
    '''
    description:
        1. read a GravitySpy metadata table (.csv) and select the glitches of one class
           seen by one interferometer above an SNR threshold
        2. keep only the glitches strictly inside (Epochstart, Epochend) and outside
           every commissioning interval
        3. read the list of safe channels and rename them so that the first '_' becomes '-'
    USAGE: GPSs, ids, re_sfchs = GrabGPStimesSafechannel('gspy-db.csv', 'L1', 7, 'Blip', 'safe_channels.txt', Epochstart, Epochend)
    :param fileid: path of a .csv file with (at least) the columns 'ifo', 'label', 'snr', 'GPStime' and 'id'
    :param ifo: a kind of interferometer, compared against the 'ifo' column
    :param SNRthre: the minimum SNR (inclusive)
    :param glitch: a glitch class, compared against the 'label' column
    :param PathSafeChannel: path of a text file with one safe channel name per line
    :param Epochstart: GPS time when the science run begins (exclusive)
    :param Epochend: GPS time when the science run ends (exclusive)
    :param Commissioning_lt: commissioning intervals [[start1, end1], [start2, end2], ...]
                             as a list or a numpy array, None (default) for no commissioning
    :return
        GPSs: GPS times of the selected glitches in a numpy array
        ids: unique ids of the selected glitches in a numpy array
        re_sfchs: renamed safe channels in a numpy array (order of the file is kept)
    '''
    # metadata of all the glitches
    df = pd.read_csv(fileid)
    # glitches of the requested class seen by this ifo with SNR >= SNRthre
    df_int = df[(df['ifo'] == ifo) & (df['label'] == glitch) & (df['snr'] >= SNRthre)]
    # restrict to the science run; both ends are excluded
    df_int = df_int[(df_int['GPStime'] > Epochstart) & (df_int['GPStime'] < Epochend)]
    # 'is not None' (rather than '!= None') so that a numpy array of intervals is accepted:
    # comparing an array with None is element-wise and cannot be used as a condition
    if Commissioning_lt is not None:
        for Commissioning in Commissioning_lt:
            Commissioning_start, Commissioning_end = Commissioning[0], Commissioning[1]
            # glitches exactly on a boundary of the interval are removed as well
            df_int = df_int[(df_int['GPStime'] < Commissioning_start) | (df_int['GPStime'] > Commissioning_end)]
    GPSs = np.array(df_int['GPStime'].tolist())
    ids = np.array(df_int['id'].tolist())
    # safe channels, one name per line
    sfchs = np.array(pd.read_csv(PathSafeChannel, names=['channel'])['channel'])
    re_sfchs = []
    for channel_name in sfchs:
        # split once at the first '_' from the left; a name without '_' raises IndexError
        splited_name = channel_name.split('_', 1)
        re_sfchs.append('%s-%s' % (splited_name[0], splited_name[1]))
    re_sfchs = np.array(re_sfchs)
    return GPSs, ids, re_sfchs
###################################################
# grab the list of the safe channels in IFO
###################################################
def GrabSafechannel(PathSafeChannel):
    '''
    description:
        read the list of safe channels, sort it and rename every channel so that
        the first '_' (from the left) becomes '-'
    USAGE: re_sfchs = GrabSafechannel('/path/to/O2_omicron_channel_list_hvetosafe_GDS.txt')
    :param PathSafeChannel: path of a text file with one safe channel name per line
    :return
        re_sfchs: the sorted, renamed safe channels in a numpy array
    '''
    def _rename(raw_name):
        # split once at the first '_'; a name without '_' raises IndexError
        pieces = raw_name.split('_', 1)
        return '%s-%s' % (pieces[0], pieces[1])

    channel_table = pd.read_csv(PathSafeChannel, names=['channel'])
    # sorted copy of the names as they appear in the file
    ordered_names = np.sort(np.array(channel_table['channel']))
    return np.array([_rename(raw_name) for raw_name in ordered_names])
###################################################
# remove the channels that are not used in O2
####################################################
def RemoveChannelUnused(re_sfchs, PathListChannelUnused):
    '''
    Description:
        Some channels in the list of safe channels can not be fetched, so they are removed here:
        1. 'GDS-CALIB_STRAIN' is removed
        2. every channel listed in the file PathListChannelUnused is removed
        3. every channel whose third '_'-separated token after the first '-' is 'NOISEMON' is removed
    USAGE:
        re_sfchs = RemoveChannelUnused(re_sfchs, '/path/to/list_of_unused_channels.txt')
    :param re_sfchs: the list of safe channels in the numpy array format
    :param PathListChannelUnused: path of a text file listing the unused channels
                                  (may be empty or contain a single channel)
    :return re_sfchs: the list of safe channels without the removed channels in the numpy array format
    '''
    def _is_noisemon(channel_name):
        # e.g. 'SUS-ETMX_L1_NOISEMON_LL' -> 'ETMX_L1_NOISEMON_LL' -> third token 'NOISEMON'
        try:
            return channel_name.split('-')[1].split('_')[2] == 'NOISEMON'
        except IndexError:
            # names without '-' or with fewer than three tokens are kept
            return False

    re_sfchs = re_sfchs[re_sfchs != 'GDS-CALIB_STRAIN']
    # np.loadtxt gives a 0-d array for a one-line file; atleast_1d/ravel make the
    # result always iterable as a flat list of names
    UnusedChannels = np.atleast_1d(np.loadtxt(PathListChannelUnused, dtype='str')).ravel()
    for unusedChannel in UnusedChannels:
        re_sfchs = re_sfchs[re_sfchs != unusedChannel]
    # drop the noisemon channels in one pass
    keep = np.array([not _is_noisemon(channel_name) for channel_name in re_sfchs], dtype=bool)
    return re_sfchs[keep]
def ListUsedSafeChannel(path_list_channel, ifo):
    '''
    description:
        1. read a header-less .csv file whose four columns are, in order,
           H1 safe channels, H1 unused channels, L1 safe channels, L1 unused channels
        2. remove the unused channels of the requested ifo from its safe channels
    USAGE: sfchs = ListUsedSafeChannel(path_list_channel, ifo)
    :param path_list_channel: a path to a list of channels (.csv file)
    :param ifo: observatory {L1, H1} in str
    :return: sfchs: subset of safe channels in numpy array
    '''
    # the fourth name used to be 'L1UnusedChannel', which did not match the
    # '{}ListUnusedChannel' lookup below and raised KeyError for ifo='L1'
    column_names = ['H1ListSafeChannel', 'H1ListUnusedChannel', 'L1ListSafeChannel', 'L1ListUnusedChannel']
    ListChannel_df = pd.read_csv(path_list_channel, names=column_names)
    # columns of different lengths are padded with NaN by pandas; drop the padding
    # so that it does not end up as a channel called 'nan'
    safe_names = ListChannel_df['{}ListSafeChannel'.format(ifo)].dropna().tolist()
    unused_names = ListChannel_df['{}ListUnusedChannel'.format(ifo)].dropna().tolist()
    sfchs = np.array(safe_names, dtype=str)
    if unused_names:
        sfchs = sfchs[~np.isin(sfchs, np.array(unused_names, dtype=str))]
    return sfchs
#
# #######################################
# # take the subset of the channels
# #######################################
#
# def TakeSubsetList(list, subset_num, cor_list=[None]):
# '''
#
# :param list: an original list
# :param subset_num: the size of the output list which is randomly chosen a subset of the original list
# :param cor_list: a corresponding list if it is related to the original list (e.g., IDs of the list GPSs given by GravitySpy)
# :return: a reduced list, a corresponding list
# '''
# print(cor_list[0])
# if cor_list[0] == None:
# if len(list) > subset_num: # if the size of an original list is greater than the size of the subset
# list_subset = np.array(rd.sample(list, subset_num))
# else: # if not
# list_subset = list
# return list_subset
# else:
# if len(list) > subset_num: # if the size of an original list is greater than the size of the subset
# list_subset = np.array(rd.sample(list, subset_num))
# else: # if not
# list_subset = list
# # the index having the elements of the subset in list
# ind_subset = [np.where(list==i)[0][0] for i in list_subset]
# # the subset of cor_list
# cor_list_subset = cor_list[ind_subset]
# return list_subset, cor_list_subset
#
# ################################################################
# # write out the output of the safe channels around GPS times
# ###############################################################
#
# def SaveTimeseriesHdf5(GPSs, ids, re_sfchs, ifo, wd, outputpath, outputfilename):
# '''
# This extracts the output of the safe channels around the GPS times
# input:
# GPSs: the list of GPS times for a given glitch
# ids: the list of unique IDs labeled GravitySpy
# re_sfchs: the list the safe channels
# ifo: kind of interferomter {'H1', 'L1', 'V1'}
# wd: forward and backward time window around the peak GPS time
# outputpath: Path of an output directory
# outputfilename: the name of a output file where the file stores the time series of a glitch
# output:
# hdf5 file
# data structure: group: "gps0" has dataset "data0", "data1", ...
# dataset: "data0" has times series of every safe channels
# dataset attributes: "GPS", "channel", "SamplingRate"
# description:
# 1- if there is a group about a GPS exists
# 1- if there are full channel data sets stored in this group
# 1- if there actually time series stored in each data set
# -> skip this group and go to the next GPS iteration
# 2- if there is at least an empty data set in this group
# -> delete this group and re-create this group, then extract data sets from the server
# 2- if there are no or not enough data sets stored in this group
# -> delete this group and re-create this group, then extract data sets from the server
#
# USAGE:write_output_SafeChannel('test.')
# '''
# # the path and the file name
# ouputPathFileName = os.path.join(outputpath, outputfilename)
# # make a log file
# logFileName = '{}LOG.txt'.format(outputfilename.split('.')[0]) # log file name
# logfilePathName = os.path.join(outputpath, logFileName) # # path and the file name of the log file
# logfileobj = open(logfilePathName, 'w') # Truncate file to zero length or create text file for writing
#
# # looping with GPS times and the list of id
# for group_id, (GPS, id)in enumerate(zip(GPSs, ids), 0):
# try: # create a file in hdf5 format it there is no such a file
# f = h5py.File(ouputPathFileName, 'a')
# except IOError:
# f = h5py.File(ouputPathFileName, 'r+')
# # note, group_id is used as a label for a group in hdf5 file
# # make a group in the file as, for example, gps00001
# try: # create a group if it does not exist
# g = f.create_group("gps{:05d}".format(group_id))
# # add attributes
# g.attrs['GPS'] = GPS
# g.attrs['ifo'] = ifo
# g.attrs['id'] = id
# except ValueError: # this group already exists
# # read this group
# g = f["gps{:05d}".format(group_id)]
# # check there are every safe channels data stored in this group
# # check if there are all the channels in this group (e.g., 815)
# if len(g.keys()) == len(re_sfchs): # yes, there are data sets with the same number of the channels
# # check if there are actually time series stored in this dataset
# for dset_name in g.keys():
# if g['{}'.format(dset_name)].value.shape[0] == 0: # no, some dataset is empty
# argu = 'there is at least an empty dataset' # for the next evaluation
# break # get out this loop
# else: # yes, all the data sets are filled with data
# argu = 'there are full datasets' # for the next evaluation
# pass
# if argu == 'there is at least one empty dataset':
# del f['gps{:05d}'.format(group_id)] # delete this group
# elif argu == 'there are full datasets':
# print('group{:05d} out of {} groups: there are full datasets around GPS time {}'.format(group_id, len(GPSs),GPS))
# log1 = 'group{:05d} out of {} groups at GPS time {} was already done. ID:{}'.format(group_id, len(GPSs),GPS, id)
# logfileobj.write(log1) # take a log
# logfileobj.write('\n') # to the next line
# continue # go to the next GPS iteration
# else: # no, there are less or no data sets with the same number of the channels
# del f['gps{:05d}'.format(group_id)] # delete this group
# try: # re-create this group
# g = f.create_group("gps{:05d}".format(group_id))
# g.attrs['GPS'] = GPS
# g.attrs['ifo'] = ifo
# g.attrs['id'] = id
# except ValueError:
# pass
#
# # the list of channel names
# chnames = ['{0}:{1}'.format(ifo, i) for i in re_sfchs]
# # start time
# tstart = GPS - wd
# # end time
# tend = GPS + wd
# try: # extract time series from the server
# tbeg = time.time()
# print('group{:05d} out of {} groups : taking data set around GPS time {} ...'.format(group_id, len(GPSs), GPS))
# FrameType = '{}_R'.format(ifo)
# data = TimeSeriesDict.get(chnames, tstart, tend, frametype=FrameType, verbose=True)
# tfin = time.time()
# print('It took {0} sec for getting data at {1}'.format(float(tfin-tbeg), GPS))
# # make log
# log2 = 'group{:05d} out of {} groups at GPS time {} is now done. ID:{}'.format(group_id, len(GPSs),GPS, id)
# logfileobj.write(log2) # take a log
# logfileobj.write('\n') # go to the new line
# except IndexError:
# # this gives, e.g., IndexError: After reading object of type: FrAdcData stream is no longer in good state. ( Frmae: 0 ChannelType: L1:ISI-GND_STS_ETMY_X_DQ offset: 1153856716)
# log3 = 'IndexError: group{:05d} out of {} groups st GPS time {} was SKIPPED. ID:{}'.format(group_id, len(GPSs), GPS, id)
# print(log3)
# logfileobj.write(log3) # take a log
# logfileobj.write('\n') # go to the new line
# continue # go to the next GPS iteration
# # store the data
# for dset_id, channel in enumerate(chnames, 0):
# # time series of the data
# ts = data[channel].value
# # sampling rate in Hz
# samplingrate = data[channel].sample_rate.value
# # store the time series with compression
# dset = g.create_dataset("ch{:05d}".format(dset_id), data=ts, compression="gzip", compression_opts=9)
# # make attributes on the data set 'dset'
# dset.attrs['channel'] = channel.split(':')[1]
# dset.attrs['SamplingRate'] = samplingrate
# f.close()
# logfileobj.close()
# return None
#
#
def Multiprocess_ConvertToTable(cache_indiv, trigger_pipeline, IFO, Columns):
    '''
    description:
        read one trigger file with gwpy's EventTable and convert it to a pandas frame.
        This is defined at module level because multiprocessing does not work when it
        is a method of the class IdentifyGlitch()
    :param cache_indiv: an individual cache (each trigger file)
    :param trigger_pipeline: a name of trigger pipeline {omicron, pycbc-live}
    :param IFO: a name of the detector {L1, H1}, only used for pycbc-live
    :param Columns: a list of columns to read, only used for pycbc-live
    :return: df_indiv: a metadata of triggers in pandas frame; an empty frame when the
             file can not be read (KeyError or ValueError for omicron, KeyError for pycbc-live)
    :raise ValueError: if trigger_pipeline is neither 'omicron' nor 'pycbc-live'
    '''
    if trigger_pipeline == 'omicron':
        try:
            t = EventTable.read(cache_indiv, format='ligolw', tablename='sngl_burst')
            df_indiv = t.to_pandas()  # convert it to a pandas table
        except (KeyError, ValueError):
            df_indiv = pd.DataFrame()
    elif trigger_pipeline == 'pycbc-live':
        try:
            t = EventTable.read(cache_indiv, format='hdf5.pycbc_live', ifo='{}'.format(IFO), columns=Columns)
            df_indiv = t.to_pandas()  # convert it to pandas frame
        except KeyError:
            df_indiv = pd.DataFrame()
    else:
        # previously this fell through to 'return df_indiv' and failed with an
        # UnboundLocalError that did not name the offending value
        raise ValueError('unknown trigger_pipeline {!r}: expected "omicron" or "pycbc-live"'.format(trigger_pipeline))
    return df_indiv
[docs]class IdentifyGlitch():
def __init__(self):
    '''
    description:
        nothing is initialised here; no attribute is set on the instance
    '''
    pass
def FindObservingTimeSegments(self, IFO, startT, endT, outputMother_dir, state='observing'):
    '''
    description:
        1. query a DataQualityFlag of the requested state between startT and endT
        2. save it as 'DataQualityFlag.h5' in the directory <outputMother_dir>/<int(startT)>_<int(endT)>
        3. read the active segments back from that file
        4. return the active segments as an integer numpy array together with the directory
    USAGE: SegmentsMat, trigger_dir = FindObservingTimeSegments(IFO, startT, endT, outputMother_dir, state)
    :param IFO: a type of interferometer, 'L1' or 'H1'
    :param startT: starting time; it must be convertible with int() because it is used in the directory name
    :param endT: ending time; same requirement as startT
    :param outputMother_dir: an output directory under which the HDF5 file will be stored
    :param state: state of an interferometer, {observing, nominal-lock}; any other value
                  prints a message and exits through sys.exit()
    :return: SegmentsMat: active segments in a numpy array of shape (number of segments, 2)
             trigger_dir: the directory where the HDF5 file was written
    '''
    # choose a state channel
    if state == 'observing':
        StateChannel = 'DMT-ANALYSIS_READY:1'
    elif state == 'nominal-lock':
        StateChannel = 'DMT-GRD_ISC_LOCK_NOMINAL:1'
    else:
        print('Please a state of this interferometer either "observing" or "nominal-lock"')
        sys.exit()
    flag_name = '{0}:{1}'.format(IFO, StateChannel)
    # take segments data from a server
    segs = DataQualityFlag.query(flag_name, startT, endT)
    # path of an output directory
    trigger_dir = os.path.join(outputMother_dir, '{}_{}'.format(int(startT), int(endT)))
    try:
        os.mkdir(trigger_dir)
    except OSError:
        pass  # typically the directory already exists
    output_path = os.path.join(trigger_dir, 'DataQualityFlag.h5')
    try:
        segs.write(output_path)
    except IOError:  # if this file already exists
        os.remove(output_path)
        segs.write(output_path)
    # read the whole 'active' table into memory, then close the file; the handle
    # used to be left open for the lifetime of the process
    with h5py.File(output_path, 'r') as segs_h5:
        Active_segs = segs_h5[flag_name]['active'][()]
    SegmentsMat = np.zeros((Active_segs.size, 2))
    for ind, seg in enumerate(Active_segs):
        # NOTE(review): fields 0 and 2 are taken as start and end; presumably the rows are
        # (start, start_ns, end, end_ns) as written by gwpy - confirm against the file
        SegmentsMat[ind, 0] = seg[0]  # starting time
        SegmentsMat[ind, 1] = seg[2]  # ending time
    SegmentsMat = SegmentsMat.astype(int)
    return SegmentsMat, trigger_dir
def OmimcronTriggerPath(self, ListSegments, TriggerDir, IFO, output_file='omicron.txt', channel='GDS-CALIB_STRAIN'):
    '''
    description:
        for every segment, run the trigfind command line tool through the shell and
        append its output (paths of omicron trigger files) to <TriggerDir>/<output_file>
    USAGE: OmimcronTriggerPath(ListSegments, '/home/kentaro.mogushi/longlived/OmicronTrigger', 'L1')
    :param ListSegments: (list of lists) [[s1, e1], [s2, e2], ...], segments to look up
    :param TriggerDir: (str) the directory of the output file
    :param IFO: interferometer (L1, or H1)
    :param output_file: (str) an output file
    :param channel: (str) the name of a channel
    :return: None
    '''
    # NOTE(review): the command is handed to a shell as one string, so the arguments must be trusted
    command_template = '/home/detchar/opt/gwpysoft-2.7/bin/trigfind {0}:{1} omicron {2} {3} >> {4}'
    destination = os.path.join(TriggerDir, output_file)
    for segment in ListSegments:
        segment_start = segment[0]
        segment_end = segment[1]
        # '>>' appends, so the paths of all the segments accumulate in one file
        os.system(command_template.format(IFO, channel, segment_start, segment_end, destination))
    print('Done for getting a list of paths of trigger files')
    return None
def CopyOmicroTriggerandUnzip(self, input_file, input_dir, TriggerDir, output_dir='OmicronTriggerXML'):
    '''
    description:
        1. load a file comprising the paths of the omicron trigger files you are interested in
        2. copy all of those files into <TriggerDir>/<output_dir> with 'cp'
        3. go to that directory and gunzip every *.gz file in place
        4. go to TriggerDir (the current working directory of the process is TriggerDir afterwards)
    USAGE: CopyOmicroTriggerandUnzip('omicron.txt', trigger_dir, trigger_dir)
    :param input_file: an input file with one path per line
    :param input_dir: the directory of the input file
    :param TriggerDir: a directory right above the output directory
    :param output_dir: an output directory where all the omicron trigger files will be stored
    :return: None
    '''
    # ++++++++++++++ load a file and copy ++++++++++++++++++++++++++
    input_path = os.path.join(input_dir, input_file)
    # dtype=str: the alias np.str no longer exists in recent numpy.
    # atleast_1d: a file with a single path is loaded as a 0-d array, which can not be iterated
    trigger_paths = np.atleast_1d(np.loadtxt(input_path, dtype=str))
    output_path = os.path.join(TriggerDir, output_dir)
    # resolve now: a relative TriggerDir would not be reachable any more after the chdir below
    final_dir = os.path.abspath(TriggerDir)
    try:
        os.mkdir(output_path)
    except OSError:
        pass  # do nothing if the directory already exists
    for trigger_path in trigger_paths:
        # NOTE(review): the paths are pasted into a shell command, so they must be trusted
        os.system('cp {0} {1}'.format(trigger_path, output_path))
    # +++++++++ unzip them +++++++++++++++++++++++++
    os.chdir(output_path)
    # -f replaces the original .xml.gz file with .xml
    os.system('for f in *.gz ; do gunzip -f "$f" > ./"${f%.*}" ; done')
    os.chdir(final_dir)
    return None
def readXML(self, input_dir, input_file):
    '''
    description:
        1. parse a .xml trigger file and look at the third child of its root
        2. take the first non-empty text found among the children of that node and
           split it into rows
        3. build one comma-separated header string from the attributes of the same children
    USAGE: TriggerMat = readXML(input_dir, input_file)
    :param input_dir: an input directory
    :param input_file: input file name
    :return: name_a: (list of str) the header string followed by the data rows
    '''
    table_node = ET.parse(os.path.join(input_dir, input_file)).getroot()[2]
    # ++++++++ get the omicron trigger's data set ++++++++++++++++
    # children without text give the string 'None' and are discarded
    node_texts = np.array([str(child.text) for child in table_node])
    filled_texts = np.delete(node_texts, np.where(node_texts == 'None'))
    rows = filled_texts[0].split('\n\t\t\t')
    # the last row still carries the closing indentation
    rows[-1] = rows[-1].replace('\n\t\t', '')
    # the piece before the first separator is redundant
    data_rows = rows[1:]
    # +++++++++ get the name of the data set (columns) ++++++++++++++++
    # what follows the last ':' in the printed attribute dictionary, without the closing "'}"
    column_names = [str(child.attrib).split(':')[-1].replace("'}", "") for child in table_node]
    header_row = [','.join(column_names)]
    # combine the name and numbers
    return header_row + data_rows
def CombinedIndentifyingProcess(self, IFO, ListSegments, TriggerDir):
    '''
    description:
        1. use OmimcronTriggerPath() to get a list of trigger files paths
        2. use CopyOmicroTriggerandUnzip() to copy trigger files and unzip them
        3. iterate over the files of <TriggerDir>/OmicronTriggerXML
            3-1. use readXML() to get meta data stored in a .xml file
            3-2. use ExtractOmcronTriggerMetadata() to re-arrange meta data matrix
            3-3. stack the matrix under the ones obtained so far
        4. use SaveMetaDataAsCSV() to save the stacked matrix as 'OmicrontriggerMetadata.csv'
    USAGE: CombinedIndentifyingProcess('L1', ListSegments, '/home/kentaro.mogushi/longlived/OmicronTrigger')
    :param IFO: ifo {H1, L1, V1}
    :param ListSegments: a list of segments
    :param TriggerDir: a mother directory of a trigger file
    :return: None
    '''
    print('Begin to get trigger meta data')
    print('get trigger files paths')
    self.OmimcronTriggerPath(ListSegments, TriggerDir, IFO)
    print('copy trigger files and unzip them')
    self.CopyOmicroTriggerandUnzip('omicron.txt', TriggerDir, TriggerDir)
    xml_dir = os.path.join(TriggerDir, 'OmicronTriggerXML')
    xml_files = os.listdir(xml_dir)
    AllMatData = np.array([])
    print('creating meta data matrix')
    for xml_file in xml_files:
        file_matrix = self.ExtractOmcronTriggerMetadata(self.readXML(xml_dir, xml_file))
        if AllMatData.size:
            AllMatData = np.vstack([AllMatData, file_matrix])
        else:
            # nothing accumulated yet: start from this matrix
            AllMatData = file_matrix
    self.SaveMetaDataAsCSV(AllMatData, TriggerDir, 'OmicrontriggerMetadata.csv')
    print('Done!')
    return None
def pyCBC_SNR_filter(self, all_pycbc_triggers_frame, SNR_low_cut, SNR_high_cut):
    '''
    description:
        keep the pycbc triggers with SNR_low_cut <= snr < SNR_high_cut
    USAGE: df_SNR_cut = pyCBC_SNR_filter(all_pycbc_triggers_frame, SNR_low_cut=7.5, SNR_high_cut=150)
    :param all_pycbc_triggers_frame: a pandas frame of all the pyCBC triggers with a column 'snr'
    :param SNR_low_cut: a lower cutoff SNR (inclusive)
    :param SNR_high_cut: a higher cutoff SNR (exclusive)
    :return: df_new: the triggers inside the SNR band
    '''
    snr_values = all_pycbc_triggers_frame['snr']
    above_floor = snr_values >= SNR_low_cut
    below_ceiling = snr_values < SNR_high_cut
    return all_pycbc_triggers_frame[above_floor & below_ceiling]
def pyCBC_template_duration_filter(self, pycbc_triggers_frame, low_cutoff_duration, high_cutoff_duration):
    '''
    description:
        keep the pycbc triggers with low_cutoff_duration <= template_duration < high_cutoff_duration
    USAGE: df_template_duration_cut = pyCBC_template_duration_filter(pycbc_triggers_frame, low_cutoff_duration, high_cutoff_duration)
    :param pycbc_triggers_frame: pyCBC triggers in pandas frame with a column 'template_duration'
    :param low_cutoff_duration: a lower cutoff of the template duration (inclusive)
    :param high_cutoff_duration: a higher cutoff of the template duration (exclusive)
    :return: df_new: the triggers inside the template duration band
    '''
    durations = pycbc_triggers_frame['template_duration']
    long_enough = durations >= low_cutoff_duration
    short_enough = durations < high_cutoff_duration
    return pycbc_triggers_frame[long_enough & short_enough]
def pyCBC_totalmass_filter(self, pycbc_triggers_frame, low_cutoff_totalmass, high_cutoff_totalmass):
    '''
    description:
        add a column 'totalmass' (mass1 + mass2) and keep the pycbc triggers with
        low_cutoff_totalmass <= totalmass < high_cutoff_totalmass
    USAGE: df_totalmass_cut = pyCBC_totalmass_filter(pycbc_triggers_frame, low_cutoff_totalmass, high_cutoff_totalmass)
    :param pycbc_triggers_frame: pyCBC triggers in pandas frame with columns 'mass1' and 'mass2'
    :param low_cutoff_totalmass: a lower cutoff of the total mass (inclusive)
    :param high_cutoff_totalmass: a higher cutoff of the total mass (exclusive)
    :return: df_new: the triggers inside the total mass band, including the new column;
             the input frame is not modified
    '''
    summed_mass = pycbc_triggers_frame.mass1 + pycbc_triggers_frame.mass2
    with_totalmass = pycbc_triggers_frame.assign(totalmass=summed_mass)
    heavy_enough = with_totalmass['totalmass'] >= low_cutoff_totalmass
    light_enough = with_totalmass['totalmass'] < high_cutoff_totalmass
    return with_totalmass[heavy_enough & light_enough]
def pyCBC_massratio_filter(self, pycbc_triggers_frame, low_cutoff_massratio, high_cutoff_massratio):
    '''
    description:
        add a column 'massratio' (mass1 / mass2) and keep the pycbc triggers with
        low_cutoff_massratio <= massratio < high_cutoff_massratio
    USAGE: df_massratio_cut = pyCBC_massratio_filter(pycbc_triggers_frame, low_cutoff_massratio, high_cutoff_massratio)
    :param pycbc_triggers_frame: pyCBC triggers in pandas frame with columns 'mass1' and 'mass2'
    :param low_cutoff_massratio: a lower cutoff of the mass ratio (inclusive)
    :param high_cutoff_massratio: a higher cutoff of the mass ratio (exclusive)
    :return: df_new: the triggers inside the mass ratio band, including the new column;
             the input frame is not modified
    '''
    ratio = pycbc_triggers_frame.mass1 / pycbc_triggers_frame.mass2
    with_massratio = pycbc_triggers_frame.assign(massratio=ratio)
    above_floor = with_massratio['massratio'] >= low_cutoff_massratio
    below_ceiling = with_massratio['massratio'] < high_cutoff_massratio
    return with_massratio[above_floor & below_ceiling]
def calculate_chisqr_weighted_snr(self, snr, chisq, chisq_dof):
    '''
    description:
        calculate the chi-square weighted SNR
        Reference: Macleod et al. 2015, Equation (21)
            snr                                                  if chisq <= chisq_dof
            snr * ((1 + (chisq / chisq_dof)**3) / 2)**(-1/6)     otherwise
    USAGE: chisq_weighted_snr = calculate_chisqr_weighted_snr(snr, chisq, chisq_dof)
    :param snr: SNR
    :param chisq: chi-square
    :param chisq_dof: chi-square degrees of freedom
    :return: chisq_weighted_snr: the chi-square weighted SNR; NaN when chisq or chisq_dof is NaN
    '''
    if chisq <= chisq_dof:
        return snr
    # a plain 'else' branch: with NaN both 'chisq <= chisq_dof' and 'chisq > chisq_dof'
    # are False, which used to leave the result undefined (UnboundLocalError);
    # now NaN simply propagates through the formula
    return snr * ((1 + (chisq / chisq_dof) ** 3) / 2.) ** (-1 / 6.)
def pyCBC_chisqr_weighted_snr_filter(self, pycbc_triggers_frame, chisqr_weighted_snr_lower_cutoff):
    '''
    description:
        add a column 'chisqr_weighted_snr' computed row by row with
        calculate_chisqr_weighted_snr() and keep the pycbc triggers strictly above the cutoff
    USAGE: df_chi_square_weighted_cut = pyCBC_chisqr_weighted_snr_filter(pycbc_triggers_frame, chisqr_weighted_snr_lower_cutoff)
    :param pycbc_triggers_frame: pyCBC triggers in pandas frame with columns 'snr', 'chisq' and 'chisq_dof'
    :param chisqr_weighted_snr_lower_cutoff: a lower cutoff chi-square weighted snr (exclusive)
    :return: df_new: triggers with chi square weighted SNR above chisqr_weighted_snr_lower_cutoff,
             including the new column; an empty frame gives an empty frame
    '''
    # otypes is given explicitly for two reasons:
    #  - without it np.vectorize can not be called on size 0 inputs (empty frame)
    #  - without it the output type is taken from the first row only
    reweight = np.vectorize(self.calculate_chisqr_weighted_snr, otypes=[float])
    weighted_snr = reweight(pycbc_triggers_frame.snr, pycbc_triggers_frame.chisq, pycbc_triggers_frame.chisq_dof.astype('float'))
    pycbc_triggers_frame = pycbc_triggers_frame.assign(chisqr_weighted_snr=weighted_snr)
    df_new = pycbc_triggers_frame[pycbc_triggers_frame.chisqr_weighted_snr > chisqr_weighted_snr_lower_cutoff]
    return df_new
def pyCBC_query_outlier(self, pycbc_triggers_frame, BinNum, Nsigma, cut):
    '''
    description:
        Select the loud pyCBC triggers, i.e. the triggers lying above a cubic curve in the
        (log10 of chi-square per DOF, log10 of SNR) plane
        1. bin the triggers with values of log10 of chi-square per DOF
        2. (cut = 'median') calculate the median of log10 of SNR and of log10 of chi-square per DOF in each bin
        2. (cut = 'mad') calculate in each bin
              y = median of log10 of SNR + Nsigma * deviation of log10 of SNR
              x = median of log10 of chi-square per DOF - Nsigma * deviation of log10 of chi-square per DOF
           where "deviation" is the mean absolute deviation around the mean, which is what the pandas
           method Series.mad() used here originally computed
        3. drop the bins that can not be used (empty bins; for 'mad' also bins with zero deviation)
        4. fit a polynomial of degree 3 to y as a function of x
        5. return the triggers with log10 of SNR strictly above the fitted curve
        NOTE: the bins are half-open [start, end), so the trigger(s) with the largest chi-square per DOF
              do not contribute to any bin; they are still classified in step 5
    USAGE: pycbc_loud = pyCBC_query_outlier(pycbc_triggers_frame, BinNum=50, Nsigma=1, cut='median')

    :param pycbc_triggers_frame: pycbc triggers in pandas frame, the columns "snr", "chisq" and "chisq_dof" are read
    :param BinNum: the number of bin edges for log10 of chi square per degrees of freedom (BinNum - 1 bins)
    :param Nsigma: a multiplier of the deviation, only used with cut = 'mad'
    :param cut: a method of cut {median or mad}
    :return: pycbc_loud: loud triggers, with the extra columns "log_SNR" and "log_chisqr_per_dof"
    :raises ValueError: if cut is neither 'median' nor 'mad'
    '''
    if cut not in ('median', 'mad'):
        # an unknown method used to skip both branches and fail at the return statement
        raise ValueError("cut must be 'median' or 'mad', got {!r}".format(cut))

    def _mean_abs_dev(values):
        # same quantity as the pandas method Series.mad(), which no longer exists in pandas >= 2.0
        return (values - values.mean()).abs().mean()

    # log10 of SNR
    pycbc_triggers_frame = pycbc_triggers_frame.assign(log_SNR=np.log10(pycbc_triggers_frame.snr))
    # log10 of chi-square per its degree of freedom
    pycbc_triggers_frame = pycbc_triggers_frame.assign(log_chisqr_per_dof=np.log10(pycbc_triggers_frame.chisq / pycbc_triggers_frame.chisq_dof))
    log_SNR = pycbc_triggers_frame.log_SNR
    log_chisqr_per_dof = pycbc_triggers_frame.log_chisqr_per_dof
    Bins_log_chisqr_per_dof = np.linspace(log_chisqr_per_dof.min(), log_chisqr_per_dof.max(), BinNum, endpoint=True)
    # triggers falling in each bin, computed once and shared by all the per-bin statistics
    triggers_per_bin = [pycbc_triggers_frame[(log_chisqr_per_dof >= start_bin) & (log_chisqr_per_dof < end_bin)]
                        for start_bin, end_bin in zip(Bins_log_chisqr_per_dof[:-1], Bins_log_chisqr_per_dof[1:])]
    # medians per bin; an empty bin gives nan
    list_median_log_SNR = np.array([in_bin.log_SNR.median() for in_bin in triggers_per_bin])
    list_median_log_chisqr_per_dof = np.array([in_bin.log_chisqr_per_dof.median() for in_bin in triggers_per_bin])
    if cut == 'median':
        # keep the bins where both medians are defined
        usable = ~(np.isnan(list_median_log_SNR) | np.isnan(list_median_log_chisqr_per_dof))
        fit_log_chisqr_per_dof = list_median_log_chisqr_per_dof[usable]
        fit_log_SNR = list_median_log_SNR[usable]
    else:  # cut == 'mad'
        list_mad_log_SNR = np.array([_mean_abs_dev(in_bin.log_SNR) for in_bin in triggers_per_bin])
        list_mad_log_chisqr_per_dof = np.array([_mean_abs_dev(in_bin.log_chisqr_per_dof) for in_bin in triggers_per_bin])
        # keep the bins where both deviations are defined and non-zero
        usable = ~(np.isnan(list_mad_log_SNR) | (list_mad_log_SNR == 0)
                   | np.isnan(list_mad_log_chisqr_per_dof) | (list_mad_log_chisqr_per_dof == 0))
        # upper bound of log10 of SNR and lower bound of log10 of chi-square per DOF
        fit_log_SNR = list_median_log_SNR[usable] + Nsigma * list_mad_log_SNR[usable]
        fit_log_chisqr_per_dof = list_median_log_chisqr_per_dof[usable] - Nsigma * list_mad_log_chisqr_per_dof[usable]
    # the polynomial curve is y = p[0] * x**3 + p[1] * x**2 + p[2] * x + p[3]
    # where x is log10 of chi square per degrees of freedom and y is log10 of SNR
    p = np.polyfit(fit_log_chisqr_per_dof, fit_log_SNR, 3)
    cut_log_SNR = p[0] * log_chisqr_per_dof ** 3 + p[1] * log_chisqr_per_dof ** 2 + p[2] * log_chisqr_per_dof + p[3]
    # loud pycbc triggers are triggers with SNR strictly above the fitted curve
    pycbc_loud = pycbc_triggers_frame[log_SNR > cut_log_SNR]
    return pycbc_loud
def pycbc_clustering_window_around_trigger(self, pycbc_trigger, IFO, one_sided_window, extension_duration):
    '''
    description:
        Clustering filter based on a window centred on each trigger
        1. sort the pycbc triggers by end time
        2. visit the triggers in time order; among the surviving triggers whose end time lies within
           +/- "one_sided_window" of the visited trigger, keep only the one with the highest SNR
        3. add the meta data columns required by the rest of the code
           (GPStime, duration, ifo, id, label, imgUrl, confidence)
    USAGE: pycbc_trigger = pycbc_clustering_window_around_trigger(pycbc_trigger, IFO, one_sided_window=0.1, extension_duration=1.5)

    :param pycbc_trigger: pycbc triggers in pandas frame format, the columns "end_time", "snr", "template_duration" and "template_id" are read
    :param IFO: ifo, copied to every row of the new column "ifo"
    :param one_sided_window: one sided window in seconds around a trigger for clustering
    :param extension_duration: factor multiplying "template_duration" to make the new column "duration"
    :return: pycbc_trigger: clustered pycbc triggers
    '''
    clustered = pycbc_trigger.sort_values(by=['end_time']).reset_index(drop=True)
    # labels of all the triggers before clustering; some of them disappear while looping
    labels_before_clustering = clustered.index
    for label in labels_before_clustering:
        try:
            # raises KeyError when this trigger was already dropped as redundant
            centre_time = clustered.loc[label]['end_time']
            window_start = float(centre_time - one_sided_window)
            window_end = float(centre_time + one_sided_window)
            end_times = clustered['end_time']
            in_window = clustered[(window_start <= end_times) & (end_times <= window_end)]
            # first trigger (in time order) reaching the maximum SNR of the window
            loudest_label = in_window.index[in_window['snr'] == in_window['snr'].max()][0]
            # every other trigger of the window is redundant
            clustered = clustered.drop(in_window.index[in_window.index != loudest_label], axis=0)
        except KeyError:
            continue
    # meta data
    number_of_triggers = clustered.shape[0]
    return clustered.assign(GPStime=clustered['end_time'],
                            duration=clustered['template_duration'] * extension_duration,
                            ifo=[IFO] * number_of_triggers,
                            id=['TP{}'.format(template) for template in clustered['template_id']],
                            label=['arbitrary'] * number_of_triggers,
                            imgUrl=[None] * number_of_triggers,
                            confidence=[0] * number_of_triggers)
def pycbc_clustering_timeslice(self, pycbc_trigger, IFO, startT, endT, window, extension_duration):
    '''
    description:
        Clustering filter based on fixed time slices
        1. sort the pycbc triggers by end time
        2. cut [startT, endT) into slices of length "window" and, in every slice [start, start + window),
           keep only the trigger with the highest SNR
        3. add the meta data columns required by the rest of the code
           (GPStime, duration, ifo, id, label, imgUrl, confidence)
    USAGE: pycbc_trigger = pycbc_clustering_timeslice(pycbc_trigger, IFO, startT, endT, window=0.1, extension_duration=1.5)

    :param pycbc_trigger: pycbc triggers in pandas frame, the columns "end_time", "snr", "template_duration" and "template_id" are read
    :param IFO: ifo, copied to every row of the new column "ifo"
    :param startT: start time of an epoch
    :param endT: end time of an epoch
    :param window: a window length in sec for clustering
    :param extension_duration: factor multiplying "template_duration" to make the new column "duration"
    :return: pycbc_trigger: clustered pycbc triggers
    '''
    clustered = pycbc_trigger.sort_values(by=['end_time']).reset_index(drop=True)
    for slice_start in np.arange(startT, endT, window):
        in_slice = clustered[(clustered.end_time >= slice_start) & (clustered.end_time < slice_start + window)]
        if in_slice.shape[0] == 0:
            # nothing to cluster in this slice
            continue
        # first trigger (in time order) reaching the maximum SNR of the slice
        loudest_label = in_slice.index[in_slice['snr'] == in_slice['snr'].max()][0]
        # every other trigger of the slice is redundant
        clustered = clustered.drop(in_slice.index[in_slice.index != loudest_label], axis=0)
    # meta data
    number_of_triggers = clustered.shape[0]
    return clustered.assign(GPStime=clustered['end_time'],
                            duration=clustered['template_duration'] * extension_duration,
                            ifo=[IFO] * number_of_triggers,
                            id=['TP{}'.format(template) for template in clustered['template_id']],
                            label=['arbitrary'] * number_of_triggers,
                            imgUrl=[None] * number_of_triggers,
                            confidence=[0] * number_of_triggers)
#return df_loud, df_quiet
def FindGlitchNearestGPS(self, PathMetadataFile, candidate_GPS):
    '''
    description:
        1. take a omicron trigger meta data file
        2. among the glitches within 0.5 sec of an input GPS time (window edges excluded),
           find the glitch with the largest SNR
        3. replace the label of this glitch with 'candidate'
        4. overwrite the meta data file with the relabelled table
        This function is assumed to be used in DQR.
        Once GraceDB provides a GPS time, this function labels the loudest glitch given by Omicron Trigger around
        that time, as it can be considered as the most significant candidate of astronomical event.
        In this way, we can study only the candidate glitch by specifying glitch_type = candidate in a configuration file
    USAGE: largetSNR = FindGlitchNearestGPS(PathMetadataFile, GPS)

    :param PathMetadataFile: a path to omicron meta data file (csv), the columns "GPStime", "snr" and "label" are used
    :param candidate_GPS: a GPS time to be concerned
    :return: largetSNR: the SNR of the glitch labelled as 'candidate'
    :raises IndexError: if no glitch with a valid SNR is found in the window; the file is left untouched in that case
    '''
    df = pd.read_csv(PathMetadataFile)
    start = candidate_GPS - 0.5  # start time of the target window, which is 0.5 sec before a candidate event
    end = candidate_GPS + 0.5  # end time of the target window, which is 0.5 sec after a candidate event
    # omicron triggers within the time window
    df_window = df[(df['GPStime'] > start) & (df['GPStime'] < end)]
    largetSNR = df_window['snr'].max()
    loudest_labels = df_window.index[df_window['snr'] == largetSNR]
    if len(loudest_labels) == 0:
        # same exception type as the bare index lookup used to raise, with an explicit message
        raise IndexError('no trigger found within 0.5 sec of GPS time {} in {}'.format(candidate_GPS, PathMetadataFile))
    largestglitchIndex = loudest_labels[0]
    # replace the label of this glitch with 'candidate'
    df.loc[largestglitchIndex, 'label'] = 'candidate'
    # index=False keeps the columns of the file unchanged; writing the row index would add
    # one more unnamed column to the file every time this function is called
    df.to_csv(PathMetadataFile, index=False)
    return largetSNR
#+++++++++++++++++++++++++++++++++++++
# find the allowed segments
#++++++++++++++++++++++++++++++++++++++
def _findglitchlist_background_is_clean(df_int, background_start, background_end):
    '''
    description:
        Tell whether a background segment is free of any reserved segment of df_int
        case 1 is where the end of a reserved segment is inside of the background
        case 2 is where the start of a reserved segment is inside of the background
        case 3 is where the reserved segment is inside of the background
        case 4 is where the background is inside of the reserved segment
        All the comparisons are strict, so segments that only touch at an edge do not conflict

    :param df_int: triggers with the columns "reserved_time_start" and "reserved_time_end"
    :param background_start: starting time of the background segment
    :param background_end: ending time of the background segment
    :return: True if none of the four cases occurs for any trigger, False otherwise
    '''
    reserved_start = df_int['reserved_time_start']
    reserved_end = df_int['reserved_time_end']
    case1 = (reserved_end > background_start) & (reserved_end < background_end)
    case2 = (reserved_start > background_start) & (reserved_start < background_end)
    case3 = (reserved_start > background_start) & (reserved_end < background_end)
    case4 = (reserved_start < background_start) & (reserved_end > background_end)
    return not (case1 | case2 | case3 | case4).any()


def Findglitchlist(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UserDefinedDuration, UpperDurationThresh, LowerDurationThresh, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf, flag='Both'):
    '''
    description:
        1. take the triggers of a meta data set (e.g. Gravity Spy) that are inside of the epochs
        2. shorten the durations to 90% of their values and reserve a segment
           (duration plus a gap on both sides) around every trigger
        3. drop the triggers with SNR below BGSNR_thre as they are quiet enough to be considered as background
        4. get the subset of the target glitches based on class, ifo, confidence level, SNR, duration and,
           when the column "peakFreq" exists, peak frequency
        5. accept glitches where their background segments do not coincide with any reserved segment
        6. return the info of the accepted glitches
    USAGE: Listsegments = Findglitchlist(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UserDefinedDuration, UpperDurationThresh, LowerDurationThresh, gap, position_duration_bfr_centr)

    :param df: meta data in pandas format, the columns "GPStime", "duration", "snr", "label", "ifo", "confidence" and "id" are read, "peakFreq" is optional
    :param Epoch_lt: a list of epochs, each element is (start time, end time), both ends included
    :param TargetGlitchClass: a target glitch class name (str)
    :param IFO: a type of interferometer (H1, L1, V1) (str)
    :param BGSNR_thre: triggers with SNR below this value are ignored, i.e., quiet enough to be background, float or int
    :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
    :param Confidence_thre: a lower threshold of confidence level (float or int)
    :param UserDefinedDuration: user defined duration of the target glitches (float or int), ignored unless it is positive
    :param UpperDurationThresh: an upper bound of duration in sec (float or int)
    :param LowerDurationThresh: a lower bound of duration in sec (float or int)
    :param gap: a time gap between the target and the background segments in sec
    :param position_duration_bfr_centr: proportion of duration for a target segment around a center time, e.g, 0.5 indicates duration is even distributed around a center time, 0.83 indicates 5/6 is before the center time
    :param TriggerPeakFreqLowerCutoff: a lower limit cutoff value of the peak frequency of target glitches, only used when the column "peakFreq" exists
    :param TriggerPeakFreqUpperCutoff: an upper limit cutoff value of the peak frequency of target glitches, only used when the column "peakFreq" exists
    :param targetUpperSNR_thre: an upper limit cutoff value of SNR of target glitches
    :param flag: 'Both' accepts a glitch only if both backgrounds are available, 'Either' accepts it if at least one is available; any other value accepts nothing
    :return: Listsegments: a list with one tuple per accepted glitch, made of
        the position of the glitch among the target glitches
        the target glitch starting time
        the target glitch ending time
        the preceding background starting time (0 if not available with flag = 'Either')
        the preceding background ending time (0 if not available with flag = 'Either')
        the following background starting time (0 if not available with flag = 'Either')
        the following background ending time (0 if not available with flag = 'Either')
        the GPS time
        the duration
        the SNR
        the confidence level
        the ID
    '''
    # take the data in the epochs
    df_int = pd.concat([df[(df['GPStime'] >= Epoch[0]) & (df['GPStime'] <= Epoch[1])] for Epoch in Epoch_lt])
    #### improvement for duration 2019-04-05 #############
    #### full durations are too long for capturing the glitch signature so that the solution is to choose T90
    FractionEnergy = 0.9  # (90% of energy )
    df_int = df_int.assign(gap=[gap] * df_int.shape[0])
    df_int = df_int.assign(duration=np.round(FractionEnergy * df_int['duration'], 3))  # replace full durations (T100) with T90
    if UserDefinedDuration > 0:  # accept a user defined duration
        # redefine only the duration of the target glitch
        df_int.loc[(df_int['ifo'] == IFO) & (df_int['label'] == TargetGlitchClass), ['duration']] = UserDefinedDuration
    # I reserve duration +/- gap around a GPS time of a glitch
    df_int = df_int.assign(reserved_time_start=df_int['GPStime'] - position_duration_bfr_centr * df_int['duration'] - df_int['gap'])
    df_int = df_int.assign(reserved_time_end=df_int['GPStime'] + (1 - position_duration_bfr_centr) * df_int['duration'] + df_int['gap'])
    # exclude glitches with SNR < BGSNR from df_int as they are quiet enough to be considered as background
    df_int = df_int[df_int['snr'] >= BGSNR_thre]
    # data set containing only a target glitch class
    is_target = ((df_int['label'] == TargetGlitchClass)
                 & (df_int['ifo'] == IFO)
                 & (df_int['confidence'] >= Confidence_thre)
                 & (df_int['snr'] >= targetSNR_thre)
                 # the upper SNR cut applies to every mode; it used to be skipped when "peakFreq" was missing
                 & (df_int['snr'] <= targetUpperSNR_thre)
                 & (df_int['duration'] >= LowerDurationThresh)
                 & (df_int['duration'] <= UpperDurationThresh))
    if 'peakFreq' in df_int.columns:  # gravity spy and omicron trigger mode
        is_target = is_target & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff)
    # otherwise pyCBC mode as there is no column "peakFreq"
    df_glitch = df_int[is_target]
    print('all the glitches in the class: {}'.format(df_glitch.shape[0]))
    Listsegments = []
    for ind in range(len(df_glitch)):
        # target glitch's peak GPS time and the duration
        gpstime = df_glitch['GPStime'].iloc[ind]  # a peak GPS time
        duration = df_glitch['duration'].iloc[ind]  # a duration
        SNR = df_glitch['snr'].iloc[ind]  # snr
        confi = df_glitch['confidence'].iloc[ind]  # confidence level
        ID = df_glitch['id'].iloc[ind]  # ID
        gap_individual = df_glitch['gap'].iloc[ind]
        # the starting and ending time of the target glitch
        target_timeseries_start = gpstime - duration * position_duration_bfr_centr
        target_timeseries_end = target_timeseries_start + duration
        # the background is around the target glitch
        # preceding background, which is a nearest neighbor segment before target_timeseries_start
        # following background, which is a nearest neighbor segment after target_timeseries_end
        background_duration = duration  # duration of both the preceding and following background
        pre_background_start = target_timeseries_start - gap_individual - background_duration
        pre_background_end = pre_background_start + background_duration
        fol_background_start = target_timeseries_end + gap_individual
        fol_background_end = fol_background_start + background_duration
        # security check of the backgrounds against the reserved segments of all the loud triggers
        pre_available = _findglitchlist_background_is_clean(df_int, pre_background_start, pre_background_end)
        fol_available = _findglitchlist_background_is_clean(df_int, fol_background_start, fol_background_end)
        if flag == 'Both':  # take both preceding and following backgrounds
            if not (pre_available and fol_available):
                continue
        elif flag == 'Either':  # accept if either a preceding or a following background is available
            if not (pre_available or fol_available):
                print('Both a preceding and following are NOT available')
                continue
            if not pre_available:
                pre_background_start, pre_background_end = 0, 0  # fill zero
            if not fol_available:
                fol_background_start, fol_background_end = 0, 0  # fill zero
        else:
            # an unknown flag accepts nothing
            continue
        Listsegments.append((ind, target_timeseries_start, target_timeseries_end,
                             pre_background_start, pre_background_end,
                             fol_background_start, fol_background_end,
                             gpstime, duration, SNR, confi, ID))
    return Listsegments
#=======================================
[docs]def FindglitchlistLongestBG(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf,flag='Both'):
'''
description:
1. load Gravity Spy data set (.csv file)
2. get the data about the target glitch class
3. get the subset of the target glitches based on SNR and confidence level threshold a user defines
4. accept glitches where their back ground segment do not coincide with any other glitches
5. return the info of the accepted glitches
USAGE: Listsegments = FindglitchlistOnLineMode(df, Epochstart, Epochend, Commissioning_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, flag)
:param df: GravitySpy meta data in pandas format
:param Epochstart: starting time of an epoch
:param Epochend: end time of an epoch
:param Commissioning_lt: commissioning time in list
:param TargetGlitchClass: a target glitch class name (str)
:param IFO: a type of interferometer (H1, L1, V1) (str)
:param BGSNR_thre: an upper threhold of SNR for background glitches, i.e., quiet enough ), float or int
:param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
:param Confidence_thre: a threshold of confidence level (float or int )
:param UpperDurationThresh: an upper bound of duration in sec (float or int)
:param LowerDurationThresh: a lower bound of duration in sec (float or int)
:param UserDefinedDuration: user defined duration of a glitch (float or int), 0 in default
:param gap: a time gap between the target and the background segments in sec, 1 sec in default
:param position_duration_bfr_centr: proportion of duration for a target segment around a center time, e.g, 0.5 indicates duration is even distributed around a center time, 0.83 indicates 5/6 is before the center time
:param TriggerPeakFreqLowerCutoff: a lower limit cutoff value of the peak frequency of triggers given by an ETG for target glitches
:param TriggerPeakFreqUpperCutoff: an upper limit cutoff value of the peak frequency of triggers given by an ETG queries for target glitches
:param targetUpperSNR_thre: an upper limit cutoff value of SNR of triggers given by an ETG queries for target glitches
:param flag: 'Both' or 'Either' taking both backgronds or either the preceding or the following background, respectively to accept glitches
:return: the list of parameters of glitches passing the above thresholds
Listsegments contains of
ListIndexSatisfied: a list of index of glitches
Listtarget_timeseries_start: a list of a target glitch starting time
Listtarget_timeseries_end: a list of a target glitch ending time
Listpre_background_start: a list of preceding background starting time
Listpre_background_end; a list of preceding background ending time
Listfol_background_start: a list of following background starting time
Listfol_background_end: a list of following background ending time
Listgpstime: a list of GPS times
Listduration: a list of durations
ListSNR: a list of SNRs
Listconfi: a list of confidence levels
ListID: a list of IDs
'''
df_list = []
# take the data in an Epoch
for Epoch in Epoch_lt:
StartT, EndT = Epoch[0], Epoch[1]
df_epoch = df[(df['GPStime'] >= StartT) & (df['GPStime'] <= EndT)]
df_list.append(df_epoch)
df_int = pd.concat(df_list)
#### improvement for duration 2019-04-05 #############
#### full durations are too long for capturing the glitch signature so that the solution is to choose T90
### In this situation, a gap that is manually chosen will be gap_new = gap_m + 0.05*T100
FractionEnergy = 1. # (90% of energy )
# note (1 - FractionEnergy)/2. is the fraction of the one-sided off T90
# df_int = df_int.assign(gap = gap + (1 - FractionEnergy)/2.*df_int['duration'])
df_int = df_int.assign(gap=[gap] * df_int.shape[0])
df_int['duration'] = np.round(FractionEnergy*df_int['duration'], 3) # replace full durations (T100) with T90
# make the target segments
# I reserve duration +/- gap around a GPS time of a glitch
if UserDefinedDuration > 0: # accept a user defined duration
# redefine only the duration of the target glitch
df_int.loc[(df_int['ifo']==IFO) & (df_int['label']==TargetGlitchClass), ['duration']] = UserDefinedDuration
else:
pass
df_int = df_int.assign(reserved_time_start = df_int['GPStime'] - position_duration_bfr_centr * df_int['duration'] - df_int['gap'])
df_int = df_int.assign(reserved_time_end = df_int['GPStime'] + (1 - position_duration_bfr_centr) * df_int['duration'] + df_int['gap'])
# reserving_time does not include gap
#df_int = df_int.assign(reserved_time_start = df_int['GPStime'] - position_duration_bfr_centr * df_int['duration'])
#df_int = df_int.assign(reserved_time_end = df_int['GPStime'] + (1 - position_duration_bfr_centr) * df_int['duration'])
# exclude glitches with SNR <= BGSNR from df_int as they are quiet enough to be considered as background
df_int = df_int[df_int['snr'] >= BGSNR_thre] # I think this is ok 2018-12-15
# data set containing only a target glitch class
try: # gravity spy and omicron trigger mode
# for cWB triggers
#df_glitch = df_int[(df_int.rho >= 10) & (df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff)]
df_glitch = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
except KeyError: # pyCBC mode as there is no column "peakFreq"
# df_glitch = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
# add central frequency cuts
df_glitch = df_int[(df_int['label'] == TargetGlitchClass) & (df_int['ifo'] == IFO) & (df_int['confidence'] >= Confidence_thre) & (df_int['snr'] >= targetSNR_thre) & (df_int['snr'] <= targetUpperSNR_thre) & (df_int['centralFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['centralFreq'] <= TriggerPeakFreqUpperCutoff) & (df_int['duration'] >= LowerDurationThresh) & (df_int['duration'] <= UpperDurationThresh)]
print('all the glitches in the class: {}'.format(df_glitch.shape[0]))
# the list of values satisfied as secure
ListIndexSatisfied = []
Listtarget_timeseries_start = []
Listtarget_timeseries_end = []
Listpre_background_start = []
Listpre_background_end = []
Listfol_background_start = []
Listfol_background_end = []
Listgpstime = []
Listduration = []
ListSNR = []
Listconfi = []
ListID = []
for ind in range(len(df_glitch)):
# target glitch's peak GPS time and the duration
gpstime = df_glitch['GPStime'].iloc[ind] # a peak GPS time
duration = df_glitch['duration'].iloc[ind] # # a duration
SNR = df_glitch['snr'].iloc[ind] # snr
confi = df_glitch['confidence'].iloc[ind] # confidence level
ID = df_glitch['id'].iloc[ind] # ID
gap_individual = df_glitch['gap'].iloc[ind]
if duration > 0.02: # accept the duration of glitch is greater than 0.02 sec
pass # go to the next line
else: # if the duration of a glitch is less than 0.02 sec
continue # go to the next iteration
# the starting and ending time of the target glitch
target_timeseries_start = gpstime - duration * position_duration_bfr_centr
target_timeseries_end = target_timeseries_start + duration
# take the background
# the background is around the target glitch
# preceding background, which is a nearest neighbor segment before target_timeseries_start
# following background, which is a nearest neighbor segment after target_timeseries_end
background_duration = duration # duration of both the proceeding and following background
pre_background_start = target_timeseries_start - gap_individual - background_duration
pre_background_end = pre_background_start + background_duration #target_timeseries_start - gap_individual
fol_background_start = target_timeseries_end + gap_individual
fol_background_end = fol_background_start + background_duration #target_timeseries_end + gap_individual + background_duration
# security check of the preceding background
# case 1 is where the background is coincident with the right edge of the reserved segment
# case 2 is where the background is coincident with the left edge of the reserved segment
# case 3 is where the reserved segment is inside of the background
# case 4 is where the background is inside of the reserved segment
# it violates the security of the background estimation if at least one of the cases occurs
# i.e., pro_case1, 2, 3, and 4 should be zero
pre_case1 = len(df_int[(df_int['reserved_time_end'] > pre_background_start) & (df_int['reserved_time_end'] < pre_background_end)]) # case 1
pre_case2 = len(df_int[(df_int['reserved_time_start'] > pre_background_start) & (df_int['reserved_time_start'] < pre_background_end)]) # case 2
pre_case3 = len(df_int[(df_int['reserved_time_start'] > pre_background_start) & (df_int['reserved_time_end'] < pre_background_end)]) # case 3
pre_case4 = len(df_int[(df_int['reserved_time_start'] < pre_background_start) & (df_int['reserved_time_end'] > pre_background_end)]) # case 4
# security check of the following background
fol_case1 = len(df_int[(df_int['reserved_time_end'] > fol_background_start) & (df_int['reserved_time_end'] < fol_background_end)]) # case 1
fol_case2 = len(df_int[(df_int['reserved_time_start'] > fol_background_start) & (df_int['reserved_time_start'] < fol_background_end)]) # case 2
fol_case3 = len(df_int[(df_int['reserved_time_start'] > fol_background_start) & (df_int['reserved_time_end'] < fol_background_end)]) # case 3
fol_case4 = len(df_int[(df_int['reserved_time_start'] < fol_background_start) & (df_int['reserved_time_end'] > fol_background_end)]) # case 4
if flag == 'Both': # take both preceding and following back gounds
if pre_case1 == 0 and pre_case2 == 0 and pre_case3 == 0 and pre_case4 == 0 and fol_case1 == 0 and fol_case2 == 0 and fol_case3 == 0 and fol_case4 == 0:
try:
# smallest time gap between the start of a target reserved time and the nearest reserved time of others that are before the target
neareset_gap_pre = (pre_background_start - df_int[df_int['reserved_time_end'] < pre_background_start]['reserved_time_end']).sort_values()[:1].values[0]
if neareset_gap_pre >= duration: # if the smallest gap is larger than the duration
if neareset_gap_pre > 10 * duration: # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
neareset_gap_pre = 10 * duration # replace it with the 10 time of the duration
else:
pass
if duration < 10: # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
pre_background_start = pre_background_end - neareset_gap_pre # replace
else:
pass
except IndexError: # there is no nearest glitch that satisfies the above condition
pass
try:
# smallest time gap between the end of a target reserved time and the nearest reserved time of others that are after the target
neareset_gap_fol = (df_int[df_int['reserved_time_start'] > fol_background_end]['reserved_time_start'] - fol_background_end).sort_values()[:1].values[0]
if neareset_gap_fol >= duration: # if the smallest gap is larger than the duration
if neareset_gap_fol > 10 * duration: # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
neareset_gap_fol = 10 * duration # replace it with the 10 time of the duration
else:
pass
if duration < 10: # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
fol_background_end = fol_background_start + neareset_gap_fol # replace
else:
pass
except IndexError: # there is no nearest glitch that satisfies the above condition
pass
# #print('{:.1f}, {:.1f}, {:.1f}'.format(neareset_gap_pre, neareset_gap_fol, duration))
print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, fol_background_end - fol_background_start, duration))
if round(pre_background_end - pre_background_start, 3) < round(duration, 3) or round(fol_background_end - fol_background_start, 3) < round(duration, 3):
sys.exit()
ListIndexSatisfied.append(ind) # append the index satisfied as secure
Listtarget_timeseries_start.append(target_timeseries_start)
Listtarget_timeseries_end.append(target_timeseries_end)
Listpre_background_start.append(pre_background_start)
Listpre_background_end.append(pre_background_end)
Listfol_background_start.append(fol_background_start)
Listfol_background_end.append(fol_background_end)
Listgpstime.append(gpstime)
Listduration.append(duration)
ListSNR.append(SNR)
Listconfi.append(confi)
ListID.append(ID)
else:
continue
if flag == 'Either': # accept if either a preceding or a following background is available
if pre_case1 == 0 and pre_case2 == 0 and pre_case3 == 0 and pre_case4 == 0:
if fol_case1 == 0 and fol_case2 == 0 and fol_case3 == 0 and fol_case4 == 0: # the case where both a proceeding and a following are available
try:
# smallest time gap between the start of a target reserved time and the nearest reserved time of others that are before the target
neareset_gap_pre = (pre_background_start - df_int[df_int['reserved_time_end'] < pre_background_start]['reserved_time_end']).sort_values()[:1].values[0]
if neareset_gap_pre >= duration: # if the smallest gap is larger than the duration
if neareset_gap_pre > 10 * duration: # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
neareset_gap_pre = 10 * duration # replace it with the 10 times of the duration
else:
pass
if duration < 10: # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
pre_background_start = pre_background_end - neareset_gap_pre # replace
else:
pass
except IndexError: # there is no nearest glitch that satisfies the above condition
pass
try:
# smallest time gap between the end of a target reserved time and the nearest reserved time of others that are after the target
neareset_gap_fol = (df_int[df_int['reserved_time_start'] > fol_background_end]['reserved_time_start'] - fol_background_end).sort_values()[:1].values[0]
if neareset_gap_fol >= duration: # if the smallest gap is larger than the duration
if neareset_gap_fol > 10 * duration: # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
neareset_gap_fol = 10 * duration # replace it with the 10 times of the duration
else:
pass
if duration < 10: # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
fol_background_end = fol_background_start + neareset_gap_fol # replace
else:
pass
except IndexError: # there is no nearest glitch that satisfies the above condition
pass
print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, fol_background_end - fol_background_start, duration))
if round(pre_background_end - pre_background_start, 3) < round(duration, 3) or round(fol_background_end - fol_background_start, 3) < round(duration, 3):
sys.exit()
ListIndexSatisfied.append(ind) # append the index satisfied as secure
Listtarget_timeseries_start.append(target_timeseries_start)
Listtarget_timeseries_end.append(target_timeseries_end)
Listpre_background_start.append(pre_background_start)
Listpre_background_end.append(pre_background_end)
Listfol_background_start.append(fol_background_start)
Listfol_background_end.append(fol_background_end)
Listgpstime.append(gpstime)
Listduration.append(duration)
ListSNR.append(SNR)
Listconfi.append(confi)
ListID.append(ID)
else: # the case where only a preceding is available
try:
# smallest time gap between the start of a target reserved time and the nearest reserved time of others that are before the target
neareset_gap_pre = (pre_background_start - df_int[df_int['reserved_time_end'] < pre_background_start]['reserved_time_end']).sort_values()[:1].values[0]
if neareset_gap_pre >= duration: # if the smallest gap is larger than the duration
if neareset_gap_pre > 10 * duration: # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
neareset_gap_pre = 10 * duration # replace it with the 10 times of the duration
else:
pass
if duration < 10: # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
pre_background_start = pre_background_end - neareset_gap_pre # replace
else:
pass
except IndexError: # there is no nearest glitch that satisfies the above condition
pass
print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, 0, duration))
if round(pre_background_end - pre_background_start, 3) < round(duration, 3) :
sys.exit()
ListIndexSatisfied.append(ind) # append the index satisfied as secure
Listtarget_timeseries_start.append(target_timeseries_start)
Listtarget_timeseries_end.append(target_timeseries_end)
Listpre_background_start.append(pre_background_start)
Listpre_background_end.append(pre_background_end)
Listfol_background_start.append(0) # fill zero
Listfol_background_end.append(0) # fill zero
Listgpstime.append(gpstime)
Listduration.append(duration)
ListSNR.append(SNR)
Listconfi.append(confi)
ListID.append(ID)
else:
if fol_case1 == 0 and fol_case2 == 0 and fol_case3 == 0 and fol_case4 == 0: # the case where only a following is available
try:
# smallest time gap between the end of a target reserved time and the nearest reserved time of others that are after the target
neareset_gap_fol = (df_int[df_int['reserved_time_start'] > fol_background_end]['reserved_time_start'] - fol_background_end).sort_values()[:1].values[0]
if neareset_gap_fol >= duration: # if the smallest gap is larger than the duration
if neareset_gap_fol > 10 * duration: # if the smaller gap is 10 times longer than the duration to make the program faster and get the best estimate of adjacsent continuous noise
neareset_gap_fol = 10 * duration # replace it with the 10 times of the duration
else:
pass
if duration < 10: # if on-source window is less than 10 sec, it can make the off-source window longer, otherwise, it is computatinally very expensive
fol_background_end = fol_background_start + neareset_gap_fol # replace
else:
pass
except IndexError: # there is no nearest glitch that satisfies the above condition
pass
print('{:.1f}, {:.1f}, {:.1f}'.format(0, fol_background_end - fol_background_start, duration))
if round(fol_background_end - fol_background_start, 3) < round(duration, 3):
sys.exit()
ListIndexSatisfied.append(ind) # append the index satisfied as secure
Listtarget_timeseries_start.append(target_timeseries_start)
Listtarget_timeseries_end.append(target_timeseries_end)
Listpre_background_start.append(0) # fill zero
Listpre_background_end.append(0) # fill zero
Listfol_background_start.append(fol_background_start)
Listfol_background_end.append(fol_background_end)
Listgpstime.append(gpstime)
Listduration.append(duration)
ListSNR.append(SNR)
Listconfi.append(confi)
ListID.append(ID)
else: # the case where both are NOT available
print('Both a preceding and following are NOT available')
continue
# zipped list of segments
Listsegments = list(
zip(ListIndexSatisfied, Listtarget_timeseries_start, Listtarget_timeseries_end, Listpre_background_start,
Listpre_background_end, Listfol_background_start, Listfol_background_end, Listgpstime, Listduration,
ListSNR, Listconfi, ListID))
return Listsegments
def Findglitchlist_for_timeseries_analysis(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre,
                                           Confidence_thre, UpperDurationThresh, LowerDurationThresh,
                                           UserDefinedDuration, gap, position_duration_bfr_centr,
                                           TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192,
                                           targetUpperSNR_thre=np.inf, flag='Both'):
    '''
    description:
        1. keep the triggers whose GPS time falls inside any of the given epochs
        2. discard the triggers with SNR below BGSNR_thre
        3. select the target glitches based on class, IFO, confidence, SNR, duration and,
           when the column "peakFreq" exists, peak frequency
        4. for every selected glitch longer than 0.02 sec, build the on-source segment,
           a preceding background segment and a following background segment
        5. return the info of the accepted glitches
        NOTE: this function does not check whether a background segment coincides with another trigger
    USAGE: Listsegments = Findglitchlist_for_timeseries_analysis(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf, flag='Both')

    :param df: trigger meta data in pandas format with the columns GPStime, duration, snr, confidence, id, ifo, label and, optionally, peakFreq
    :param Epoch_lt: a list of epochs, each of them is indexable as (start GPS time, end GPS time); an empty list gives an empty result
    :param TargetGlitchClass: a target glitch class name compared with the column "label"
    :param IFO: an interferometer name compared with the column "ifo"
    :param BGSNR_thre: triggers with SNR below this value are discarded before the target selection
    :param targetSNR_thre: a lower threshold of SNR for target glitches
    :param Confidence_thre: a lower threshold of confidence level for target glitches
    :param UpperDurationThresh: an upper bound of duration in sec
    :param LowerDurationThresh: a lower bound of duration in sec
    :param UserDefinedDuration: if greater than 0, it replaces the duration of every trigger of the target class in the target IFO
    :param gap: a time gap between the target and the background segments in sec
    :param position_duration_bfr_centr: proportion of duration placed before the GPS time, e.g, 0.5 indicates duration is evenly distributed around the GPS time
    :param TriggerPeakFreqLowerCutoff: a lower cutoff of "peakFreq" for target glitches (ignored when the column is absent)
    :param TriggerPeakFreqUpperCutoff: an upper cutoff of "peakFreq" for target glitches (ignored when the column is absent)
    :param targetUpperSNR_thre: an upper threshold of SNR for target glitches
    :param flag: kept for interface compatibility; it is not used by this function
    :return: Listsegments, a list with one tuple per accepted glitch made of
        index: position of the glitch within the selected target glitches
        target_timeseries_start: starting time of the target segment
        target_timeseries_end: ending time of the target segment
        pre_background_start: starting time of the preceding background
        pre_background_end: ending time of the preceding background
        fol_background_start: starting time of the following background
        fol_background_end: ending time of the following background
        gpstime: GPS time of the glitch
        duration: duration of the glitch
        SNR: SNR of the glitch
        confi: confidence level of the glitch
        ID: ID of the glitch
    '''
    min_duration = 0.02  # glitches with a duration of at most this value (sec) are skipped
    background_factor = 15  # a background segment lasts this many times the glitch duration ...
    max_background_duration = 15  # ... but no longer than this (sec) to reduce computational cost
    FractionEnergy = 1.  # fraction of the full duration that is kept; 1 keeps the full duration

    # take the data in every epoch
    df_list = [df[(df['GPStime'] >= Epoch[0]) & (df['GPStime'] <= Epoch[1])] for Epoch in Epoch_lt]
    if df_list:
        df_int = pd.concat(df_list)
    else:
        # pd.concat raises on an empty list, so carry on with an empty frame with the same columns
        df_int = df.iloc[0:0].copy()

    df_int = df_int.assign(gap=[gap] * df_int.shape[0])
    df_int['duration'] = np.round(FractionEnergy * df_int['duration'], 3)
    if UserDefinedDuration > 0:  # accept a user defined duration
        # redefine only the duration of the target glitch
        df_int.loc[(df_int['ifo'] == IFO) & (df_int['label'] == TargetGlitchClass), ['duration']] = UserDefinedDuration

    # exclude glitches with SNR below BGSNR_thre as they are quiet enough to be considered as background
    df_int = df_int[df_int['snr'] >= BGSNR_thre]

    # data set containing only a target glitch class
    selection = (
        (df_int['label'] == TargetGlitchClass)
        & (df_int['ifo'] == IFO)
        & (df_int['confidence'] >= Confidence_thre)
        & (df_int['snr'] >= targetSNR_thre)
        & (df_int['snr'] <= targetUpperSNR_thre)
        & (df_int['duration'] >= LowerDurationThresh)
        & (df_int['duration'] <= UpperDurationThresh)
    )
    if 'peakFreq' in df_int.columns:  # gravity spy and omicron trigger mode; pyCBC triggers have no "peakFreq"
        selection = selection & (df_int['peakFreq'] >= TriggerPeakFreqLowerCutoff) & (df_int['peakFreq'] <= TriggerPeakFreqUpperCutoff)
    df_glitch = df_int[selection]
    print('all the glitches in the class: {}'.format(df_glitch.shape[0]))
    if df_glitch.empty:
        return []

    columns = ('GPStime', 'duration', 'snr', 'confidence', 'id', 'gap')
    rows = zip(*(df_glitch[name].values for name in columns))
    Listsegments = []
    for ind, (gpstime, duration, SNR, confi, ID, gap_individual) in enumerate(rows):
        if not duration > min_duration:  # too short to be analyzed
            continue
        # the starting and ending time of the target glitch
        target_timeseries_start = gpstime - duration * position_duration_bfr_centr
        target_timeseries_end = target_timeseries_start + duration
        # the backgrounds are the segments right before and right after the target, separated by the gap
        background_duration = min(background_factor * duration, max_background_duration)
        pre_background_start = target_timeseries_start - gap_individual - background_duration
        pre_background_end = pre_background_start + background_duration
        fol_background_start = target_timeseries_end + gap_individual
        fol_background_end = fol_background_start + background_duration
        print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, fol_background_end - fol_background_start, duration))
        Listsegments.append((ind, target_timeseries_start, target_timeseries_end, pre_background_start,
                             pre_background_end, fol_background_start, fol_background_end, gpstime, duration,
                             SNR, confi, ID))
    return Listsegments
def FindRadomlistPoints(state, IFO, Epoch_lt, number_samples, step, outputMother_dir, df_target):
    '''
    description:
        1. within the observing segments of an epoch, randomly pick center times on a grid of a given step
        2. draw a duration for every center time from the histogram of the durations of a target glitch class
        3. make a pandas data frame, save it as RandomSample.csv and return the re-loaded frame
    USAGE: df = FindRadomlistPoints(state, IFO, Epoch_lt, number_samples, step, outputMother_dir, df_target)

    :param state: IFO state {observing, nominal-lock}
    :param IFO: an observer {H1, L1}
    :param Epoch_lt: a list of epochs; the start of the first one and the end of the last one define the queried span
    :param number_samples: number of samples picked up; it is reduced when it exceeds the number of available grid points
    :param step: step of data points in sec
    :param outputMother_dir: an output directory passed to IdentifyGlitch.FindObservingTimeSegments
    :param df_target: true glitch samples with a column "duration", used only for the distribution of durations
    :return: df: synthetic random data points loaded back from RandomSample.csv (so it also holds the index column written by to_csv);
             snr, confidence and the other numerical trigger columns are zero, id and imgUrl are empty and label is 'null'
    '''
    # get a glitch identifier object
    indentify = IdentifyGlitch()
    # get start time and end time of an epoch
    startT, endT = Epoch_lt[0][0], Epoch_lt[-1][-1]
    # get observing segments
    # NOTE(review): SegmentsMat is sliced below as a 2-D array of [start, end] rows -- presumably a numpy array; confirm against FindObservingTimeSegments
    SegmentsMat, trigger_dir = indentify.FindObservingTimeSegments(IFO, startT, endT, outputMother_dir, state)
    # the original comparison used the misspelled 'nominal-lcok' only, so the documented state never matched;
    # the misspelled value is still accepted for backward compatibility
    if state in ('nominal-lock', 'nominal-lcok'):
        SegmentsMat[:, 0] = SegmentsMat[:, 0] + 300  # remove 5 min after a segment begins
        SegmentsMat[:, 1] = SegmentsMat[:, 1] - 60  # remove 1 min before a segment ends
        # a segment shorter than the trimmed 6 min has no usable time left
        SegmentsMat = SegmentsMat[SegmentsMat[:, 1] > SegmentsMat[:, 0]]

    total_seg_used = (SegmentsMat[:, 1] - SegmentsMat[:, 0]).sum()
    max_number_samples = int(total_seg_used / float(step))
    if number_samples > max_number_samples:  # the number of samples greater than the maximum number of possible samples
        number_samples = max_number_samples
        print('User selected number of synthetic samples is greater than the maximum number of possible samples')
        print('Set the number of samples equal to the maximum number = {}'.format(number_samples))

    # -----------------------------------------------------+
    # generate synthetic list of center times of samples   |
    # -----------------------------------------------------+
    center_time_chunks = []
    for start_seg_used, end_seg_used in SegmentsMat:
        length_seg_used = end_seg_used - start_seg_used  # length of a segment in sec
        population = int(length_seg_used / float(step))  # number of grid points in this segment
        if population <= 0:  # segment shorter than one step
            continue
        # share of the request proportional to the segment length
        number_picked = int(number_samples * length_seg_used / float(total_seg_used))
        # pick one more than the share because of the flooring above, but never more than the grid holds
        # (sampling without replacement raises otherwise)
        number_drawn = min(number_picked + 1, population)
        selected_index_center_times = np.random.choice(population, number_drawn, replace=False)
        center_time_chunks.append(np.sort(start_seg_used + step * selected_index_center_times))
    if center_time_chunks:
        whole_list_center_times = np.concatenate(center_time_chunks)
    else:
        whole_list_center_times = np.array([])
    if number_samples > whole_list_center_times.size:
        # the per-segment flooring can leave fewer grid points than requested
        number_samples = whole_list_center_times.size
        print('Set the number of samples equal to the number of available grid points = {}'.format(number_samples))
    # randomly choose the timestamps where the size of the sample equal to the request, in ascending order
    whole_list_center_times = np.sort(np.random.choice(whole_list_center_times, number_samples, replace=False))

    # --------------------------------------+
    # generate synthetic list of durations  |
    # --------------------------------------+
    duration_min = 0.02  # glitches with duration of at most 0.02 s are not analyzed in this program
    duration_bin_width = 0.01
    # number of bins such that the bin width is about 0.01; at least one bin is needed by np.histogram
    Bins = max(1, int((df_target.duration.max() - duration_min) / duration_bin_width))
    target_durations = df_target.duration[df_target.duration > duration_min]
    Count, Edge = np.histogram(target_durations, bins=Bins, density=True)
    d_Edge = Edge[1] - Edge[0]  # bin width
    Center_bin = Edge[:-1] + d_Edge * 0.5  # center values of each bin
    # probability of each bin, re-normalized so that the floating point sum is 1
    probabilities = Count * d_Edge
    probabilities = probabilities / probabilities.sum()
    gen_d = np.random.choice(Center_bin, number_samples, p=probabilities)

    # make a pandas dataframe that is compatible with a standard format in this program
    zeros = np.zeros(number_samples)
    df = pd.DataFrame()
    df = df.assign(GPStime=whole_list_center_times)
    df = df.assign(startGPStime=zeros, peakFreq=zeros, snr=zeros, amplitude=zeros, centralFreq=zeros)
    df = df.assign(duration=gen_d)
    df = df.assign(bandwidth=zeros, chisq=zeros, chisqDof=zeros, confidence=zeros)
    df = df.assign(id=[None] * number_samples)
    df = df.assign(ifo=['{}'.format(IFO)] * number_samples)
    df = df.assign(label=['null'] * number_samples)
    df = df.assign(imgUrl=[None] * number_samples)
    output_path = os.path.join(trigger_dir, 'RandomSample.csv')
    df.to_csv(output_path)  # save the data sample
    df = pd.read_csv(output_path)  # load the file
    # read_csv parses the string 'null' as NaN, so the label is restored (replacing the whole column avoids a dtype-changing in-place set)
    df['label'] = 'null'
    return df
def FindShiftedPoints(state, IFO, Epoch_lt, number_samples, step, outputMother_dir, df_target):
    '''
    description:
        1. shift the trigger times of a target glitch class by 10 sec and blank their snr, label, id and imgUrl
        2. keep the shifted points that fall inside the observing segments of an epoch
        3. save them as RandomSample.csv and return the re-loaded frame
    USAGE: df = FindShiftedPoints(state, IFO, Epoch_lt, number_samples, step, outputMother_dir, df_target)

    :param state: IFO state {observing, nominal-lock}
    :param IFO: an observer {H1, L1}
    :param Epoch_lt: a list of epochs; the start of the first one and the end of the last one define the queried span
    :param number_samples: not used by this function (kept for a signature parallel to FindRadomlistPoints)
    :param step: not used by this function (kept for a signature parallel to FindRadomlistPoints)
    :param outputMother_dir: an output directory passed to IdentifyGlitch.FindObservingTimeSegments
    :param df_target: true glitch samples with a column "GPStime"; it is not modified
    :return: df: the shifted points loaded back from RandomSample.csv (so it also holds the index column written by to_csv), with label 'null'
    '''
    time_shift = 10  # sec, intentional offset from the true trigger times

    # get a glitch identifier object
    indentify = IdentifyGlitch()
    # get start time and end time of an epoch
    startT, endT = Epoch_lt[0][0], Epoch_lt[-1][-1]
    # get observing segments
    # NOTE(review): SegmentsMat is sliced below as a 2-D array of [start, end] rows -- presumably a numpy array; confirm against FindObservingTimeSegments
    SegmentsMat, trigger_dir = indentify.FindObservingTimeSegments(IFO, startT, endT, outputMother_dir, state)
    # the original comparison used the misspelled 'nominal-lcok' only, so the documented state never matched;
    # the misspelled value is still accepted for backward compatibility
    if state in ('nominal-lock', 'nominal-lcok'):
        SegmentsMat[:, 0] = SegmentsMat[:, 0] + 300  # remove 5 min after a segment begins
        SegmentsMat[:, 1] = SegmentsMat[:, 1] - 60  # remove 1 min before a segment ends

    number_rows = df_target.shape[0]
    df_target = df_target.assign(
        GPStime=df_target.GPStime.values + time_shift,
        snr=[0] * number_rows,
        label=['null'] * number_rows,
        id=[None] * number_rows,
        imgUrl=[None] * number_rows,
    )
    # a segment whose end precedes its start after the trimming simply selects nothing
    df_list = [df_target[(df_target.GPStime >= Seg_start) & (df_target.GPStime <= Seg_end)]
               for Seg_start, Seg_end in SegmentsMat]
    if df_list:
        df = pd.concat(df_list)
    else:
        # pd.concat raises on an empty list, so fall back to an empty frame with the same columns
        df = df_target.iloc[0:0]
    df = df.reset_index(drop=True)
    output_path = os.path.join(trigger_dir, 'RandomSample.csv')
    df.to_csv(output_path)  # save the data sample
    df = pd.read_csv(output_path)  # load the file
    # read_csv parses the string 'null' as NaN, so the label set above has to be restored
    df['label'] = 'null'
    return df
def _background_is_clean(reserved_start, reserved_end, window_start, window_end):
    '''
    Tell whether a background window is free of every reserved glitch segment.

    :param reserved_start: array of the start times of the reserved segments
    :param reserved_end: array of the end times of the reserved segments
    :param window_start: start time of the background window
    :param window_end: end time of the background window
    :return: True if none of the four overlap cases below occurs, False otherwise
    '''
    # case 1: a reserved segment ends inside the window
    end_inside = (reserved_end > window_start) & (reserved_end < window_end)
    # case 2: a reserved segment starts inside the window
    start_inside = (reserved_start > window_start) & (reserved_start < window_end)
    # case 3: a reserved segment lies inside the window
    contained = (reserved_start > window_start) & (reserved_end < window_end)
    # case 4: the window lies inside a reserved segment
    covering = (reserved_start < window_start) & (reserved_end > window_end)
    return not (end_inside | start_inside | contained | covering).any()


def _extend_preceding_background(reserved_end, window_start, window_end, duration):
    '''
    Lengthen a preceding background window toward the nearest earlier reserved segment.

    NOTE(review): the replacement is applied only when the free gap is at least one duration,
    i.e. the window is never shortened -- confirm this nesting against the upstream repository.

    :param reserved_end: array of the end times of the reserved segments
    :param window_start: current start time of the preceding background
    :param window_end: end time of the preceding background (kept fixed)
    :param duration: duration of the on-source window
    :return: the start time to use for the preceding background
    '''
    earlier = reserved_end[reserved_end < window_start]
    if earlier.size == 0:  # there is no reserved segment before the window
        return window_start
    nearest_gap = window_start - earlier.max()
    # windows of 10 s or longer are left alone because a longer off-source window is too expensive
    if nearest_gap >= duration and duration < 10:
        # cap the window at 10 durations to stay close to the adjacent noise
        return window_end - min(nearest_gap, 10 * duration)
    return window_start


def _extend_following_background(reserved_start, window_start, window_end, duration):
    '''
    Lengthen a following background window toward the nearest later reserved segment.

    NOTE(review): same nesting assumption as _extend_preceding_background.

    :param reserved_start: array of the start times of the reserved segments
    :param window_start: start time of the following background (kept fixed)
    :param window_end: current end time of the following background
    :param duration: duration of the on-source window
    :return: the end time to use for the following background
    '''
    later = reserved_start[reserved_start > window_end]
    if later.size == 0:  # there is no reserved segment after the window
        return window_end
    nearest_gap = later.min() - window_end
    if nearest_gap >= duration and duration < 10:
        return window_start + min(nearest_gap, 10 * duration)
    return window_end


def FindBGlist(state, number_trials, step, outputMother_dir, df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf, flag='Both'):
    '''
    description:
        1. take the triggers of df that lie in the epochs
        2. select the target glitches based on class, IFO, confidence, SNR, frequency and duration thresholds
        3. generate synthetic 'null' points with FindRadomlistPoints() from the target glitches louder than BGSNR_thre
        4. accept the synthetic points whose background segments do not coincide with the reserved time of any trigger louder than BGSNR_thre
        5. return the info of the accepted points
    USAGE: Listsegments = FindBGlist(state, number_trials, step, outputMother_dir, df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UpperDurationThresh, LowerDurationThresh, UserDefinedDuration, gap, flag=flag)

    :param state: IFO state {observing, nominal-lock}, passed to FindRadomlistPoints()
    :param number_trials: number of synthetic points requested, passed to FindRadomlistPoints()
    :param step: step of data points in sec, passed to FindRadomlistPoints()
    :param outputMother_dir: an output directory, passed to FindRadomlistPoints()
    :param df: trigger meta data in pandas format
    :param Epoch_lt: a list of epochs, each given as (start, end)
    :param TargetGlitchClass: a target glitch class name (str)
    :param IFO: a type of interferometer (H1, L1, V1) (str)
    :param BGSNR_thre: an upper threshold of SNR for background glitches, i.e., quiet enough, float or int
    :param targetSNR_thre: a lower threshold of SNR for target glitches, float or int
    :param Confidence_thre: a threshold of confidence level (float or int)
    :param UpperDurationThresh: an upper bound of duration in sec (float or int)
    :param LowerDurationThresh: a lower bound of duration in sec (float or int)
    :param UserDefinedDuration: user defined duration of a glitch (float or int); a value of 0 or less keeps the ETG durations
    :param gap: a time gap between the target and the background segments in sec
    :param TriggerPeakFreqLowerCutoff: a lower cutoff of the peak (or central) frequency of target glitches
    :param TriggerPeakFreqUpperCutoff: an upper cutoff of the peak (or central) frequency of target glitches
    :param targetUpperSNR_thre: an upper cutoff of SNR for target glitches
    :param flag: 'Both' or 'Either' taking both backgrounds or either the preceding or the following background, respectively; any other value accepts nothing
    :return: Listsegments, a list of tuples
        (index, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end,
         fol_background_start, fol_background_end, gpstime, duration, SNR, confidence, ID)
        where index is the position among the synthetic points and an unavailable background
        (flag='Either' only) is filled with zeros
    '''
    # take the data in the epochs
    df_int = pd.concat([df[(df['GPStime'] >= Epoch[0]) & (df['GPStime'] <= Epoch[1])] for Epoch in Epoch_lt])

    # gravity spy and omicron triggers carry "peakFreq"; pyCBC triggers carry "centralFreq" instead
    freq_column = 'peakFreq' if 'peakFreq' in df_int.columns else 'centralFreq'
    # data set containing only a target glitch class
    is_target = (
        (df_int['label'] == TargetGlitchClass)
        & (df_int['ifo'] == IFO)
        & (df_int['confidence'] >= Confidence_thre)
        & (df_int['snr'] >= targetSNR_thre)
        & (df_int['snr'] <= targetUpperSNR_thre)
        & (df_int[freq_column] >= TriggerPeakFreqLowerCutoff)
        & (df_int[freq_column] <= TriggerPeakFreqUpperCutoff)
        & (df_int['duration'] >= LowerDurationThresh)
        & (df_int['duration'] <= UpperDurationThresh)
    )
    df_target = df_int[is_target]

    # generate synthetic data points and add them to the trigger table
    df_random = FindRadomlistPoints(state, IFO, Epoch_lt, number_trials, step, outputMother_dir, df_target[df_target.snr > BGSNR_thre])
    # DataFrame.append no longer exists in pandas >= 2.0; pd.concat is the equivalent
    df_int = pd.concat([df_int, df_random], ignore_index=True)

    # Only a fraction of the full duration (T100) may be kept around a trigger; the removed part is
    # added to the gap.  To get quiet segments the full duration is conservatively kept (100% of energy).
    FractionEnergy = 1
    # (1 - FractionEnergy) / 2. is the fraction cut from each side of the full duration
    df_int = df_int.assign(gap=gap + (1 - FractionEnergy) / 2. * df_int['duration'])
    df_int['duration'] = np.round(FractionEnergy * df_int['duration'], 3)

    if UserDefinedDuration > 0:
        # assign the user defined duration to any type of trigger observed in the given IFO
        df_int.loc[(df_int['ifo'] == IFO), ['duration']] = UserDefinedDuration

    # duration +/- gap is reserved around the GPS time of every trigger
    df_int = df_int.assign(reserved_time_start=df_int['GPStime'] - df_int['duration'] / 2. - df_int['gap'])
    df_int = df_int.assign(reserved_time_end=df_int['GPStime'] + df_int['duration'] / 2. + df_int['gap'])

    # the synthetic background points are the rows labelled 'null'
    df_glitch = df_int[df_int.label == 'null']
    # triggers with SNR <= BGSNR_thre are quiet enough to be considered as background
    df_int = df_int[df_int['snr'] > BGSNR_thre]
    print('all the glitches in the class: {}'.format(df_glitch.shape[0]))

    # the reserved segments do not change inside the loop, so extract them once
    reserved_start = df_int['reserved_time_start'].values
    reserved_end = df_int['reserved_time_end'].values

    Listsegments = []
    candidates = zip(df_glitch['GPStime'].values, df_glitch['duration'].values, df_glitch['snr'].values,
                     df_glitch['confidence'].values, df_glitch['id'].values, df_glitch['gap'].values)
    for ind, (gpstime, duration, SNR, confi, ID, gap_individual) in enumerate(candidates):
        if not duration > 0.02:  # only durations greater than 0.02 sec are accepted
            continue

        # the target segment is assumed to be distributed evenly around the GPS time
        target_timeseries_start = gpstime - duration / 2.
        target_timeseries_end = target_timeseries_start + duration

        # the backgrounds are the nearest neighbor segments on each side of the target, one gap away
        background_duration = duration
        pre_background_start = target_timeseries_start - gap_individual - background_duration
        pre_background_end = pre_background_start + background_duration
        fol_background_start = target_timeseries_end + gap_individual
        fol_background_end = fol_background_start + background_duration

        # security check: a background is usable only if no reserved segment coincides with it
        pre_available = _background_is_clean(reserved_start, reserved_end, pre_background_start, pre_background_end)
        fol_available = _background_is_clean(reserved_start, reserved_end, fol_background_start, fol_background_end)

        if flag == 'Both':  # take both preceding and following backgrounds
            if not (pre_available and fol_available):
                continue
            pre_background_start = _extend_preceding_background(reserved_end, pre_background_start, pre_background_end, duration)
            fol_background_end = _extend_following_background(reserved_start, fol_background_start, fol_background_end, duration)
            print('{:.1f}, {:.1f}, {:.1f}'.format(pre_background_end - pre_background_start, fol_background_end - fol_background_start, duration))
            if round(pre_background_end - pre_background_start, 3) < round(duration, 3) or round(fol_background_end - fol_background_start, 3) < round(duration, 3):
                # a background must never be shorter than the target; stop instead of returning bad segments
                sys.exit('FindBGlist: a background segment is shorter than the target duration around GPS time {}'.format(gpstime))
        elif flag == 'Either':  # accept if either a preceding or a following background is available
            if not (pre_available or fol_available):
                print('Both a preceding and following are NOT available')
                continue
            if pre_available:
                pre_background_start = _extend_preceding_background(reserved_end, pre_background_start, pre_background_end, duration)
            else:
                pre_background_start = pre_background_end = 0  # fill zero
            if fol_available:
                fol_background_end = _extend_following_background(reserved_start, fol_background_start, fol_background_end, duration)
            else:
                fol_background_start = fol_background_end = 0  # fill zero
        else:  # an unknown flag accepts nothing
            continue

        Listsegments.append((ind, target_timeseries_start, target_timeseries_end, pre_background_start,
                             pre_background_end, fol_background_start, fol_background_end, gpstime, duration,
                             SNR, confi, ID))
    return Listsegments
def TimeShiftingSamplePrecedingBGonly(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UserDefinedDuration, gap, position_duration_bfr_centr, TriggerPeakFreqLowerCutoff=0, TriggerPeakFreqUpperCutoff=8192, targetUpperSNR_thre=np.inf,flag='Both'):
    '''
    description:
        build a regular grid of 1-sec target segments, each immediately preceded by a 10-sec background,
        by sliding both windows in steps of 1 sec from the start of the first epoch
    USAGE: Listsegments = TimeShiftingSamplePrecedingBGonly(df, Epoch_lt, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UserDefinedDuration, gap, position_duration_bfr_centr)

    :param Epoch_lt: a list of epochs; only the start of the first and the end of the last epoch are read
    :param df, TargetGlitchClass, IFO, BGSNR_thre, targetSNR_thre, Confidence_thre, UserDefinedDuration, gap,
           position_duration_bfr_centr, TriggerPeakFreqLowerCutoff, TriggerPeakFreqUpperCutoff,
           targetUpperSNR_thre, flag: not used by this function
    :return: Listsegments, a list with int(end - start) tuples
        (index, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end,
         fol_background_start, fol_background_end, gpstime, duration, SNR, confidence, ID)
        where the following background is filled with zeros, gpstime is the center of the target,
        duration is 1, SNR is 10, confidence is 0 and ID is the string 'None'
    NOTE(review): the first target starts 10 sec after the epoch start, so the last targets end after the epoch end -- confirm this is intended
    '''
    on_source_length = 1  # length of a target segment in sec
    off_source_length = 10  # length of a preceding background in sec

    epoch_start = Epoch_lt[0][0]
    epoch_end = Epoch_lt[-1][-1]
    # the first target begins right where the first background ends
    first_target_start = epoch_start + off_source_length

    Listsegments = []
    for shift in range(int(epoch_end - epoch_start)):
        target_start = first_target_start + on_source_length * shift
        background_start = epoch_start + on_source_length * shift
        Listsegments.append((
            shift,
            target_start,
            target_start + on_source_length,
            background_start,
            background_start + off_source_length,
            0,  # no following background
            0,
            target_start + 0.5 * on_source_length,
            1,
            10,
            0,
            'None',
        ))
    return Listsegments
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#take time series of safe channels and calculate the absolute values of whitened FFT
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
def select_some_trials(iterate, maximum_iterate=5):
    '''
    description:
        cap the number of trials taken from an off-source window. When more than
        maximum_iterate trials are available, a random subset of the trial indices
        is drawn without replacement; otherwise every trial index is kept.
    USAGE: list_index_trials = select_some_trials(iterate, maximum_iterate=5)

    :param iterate: the number of available trials
    :param maximum_iterate: the largest number of trials to keep
    :return: a numpy array of trial indices taken from 0, ..., iterate - 1;
             maximum_iterate distinct random indices if iterate > maximum_iterate,
             all indices in ascending order otherwise
    '''
    too_many_trials = iterate > maximum_iterate
    if not too_many_trials:
        # few enough trials: keep all of them, in order
        return np.arange(iterate).astype(int)
    # too many trials: draw distinct indices at random
    return np.random.choice(range(iterate), maximum_iterate, replace=False)
def _whitened_background_trials(background, duration, ave_background_asd):
    '''
    description:
        compute the whitened ASDs of one off-source (BG) window.
        If the window is longer than the target duration, sub-segments of that duration are
        cropped forward from the window start and backward from the window end; the number of
        sub-segments in each direction is limited by select_some_trials. Otherwise the whole
        window is used as a single trial.
    :param background: time series of the BG window
    :param duration: the (truncated) duration of the target segment, used as the FFT length
    :param ave_background_asd: numpy array of the reference BG ASD used for whitening
    :return: a flat numpy array of the whitened ASD values (forward trials first, then backward trials)
    '''
    if not background.duration.value > duration:
        # the BG is not longer than the target segment: use it as a single trial
        return background.asd(duration, method='welch').value / ave_background_asd

    # number of BG trials
    iterate = int(background.duration.value / duration)
    # take the boundaries from the cropped BG itself so that the clamping below avoids the cropping warning
    background_start, background_end = background.span[0], background.span[1]

    def whitened_asd(crop_start, crop_end):
        # crop one trial and whiten its ASD with the reference BG ASD
        cropped = background.crop(start=crop_start, end=crop_end)
        return cropped.asd(duration, method='welch').value / ave_background_asd

    ## ------ cropping from the starting time ------
    forward_trials = []
    for i in select_some_trials(iterate):  # take some fraction of trials
        crop_start = background_start + i * duration
        # the end time is replaced with the end of the BG if it is larger
        crop_end = min(crop_start + duration, background_end)
        forward_trials.append(whitened_asd(crop_start, crop_end))

    ## ------ cropping from the ending time ------
    backward_trials = []
    for i in select_some_trials(iterate):  # take some fraction of trials
        crop_end = background_end - i * duration
        # the start time is replaced with the start of this BG if it is smaller
        crop_start = max(crop_end - duration, background_start)
        backward_trials.append(whitened_asd(crop_start, crop_end))

    return np.concatenate(forward_trials + backward_trials, axis=None)


def Multiprocess_whitening(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end):
    '''
    description:
        This is used for multi processing for whitening segments.
        The ASD of the target segment and the ASDs of trials cropped from the preceding and
        following backgrounds (BGs) are divided by a reference BG ASD. A BG whose start time
        is 0 is treated as unavailable.
    USAGE: whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, DURATION = Multiprocess_whitening(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end)

    :param full_timeseries: time series comprising target and BGs
    :param target_timeseries_start: a start time of a target segment
    :param target_timeseries_end: an end time of a target segment
    :param pre_background_start: a start time of a preceding BG (0 if it is not available)
    :param pre_background_end: an end time of a preceding BG
    :param fol_background_start: a start time of a following BG (0 if it is not available)
    :param fol_background_end: an end time of a following BG
    :return: whitened_fft_target: whitened fft of a target segment
             whitened_fft_PBG: whitened fft of a preceding segment (None if the preceding BG is not available)
             whitened_fft_FBG: whitened fft of a following segment (None if the following BG is not available)
             sample_rate: sampling rate of this channel
             DURATION: a duration of a target segment truncated to an even number of milliseconds
    :raises ValueError: if neither BG is available; a ValueError raised by the ASD calculation is propagated unchanged
    '''
    sample_rate = full_timeseries.sample_rate.value
    DURATION = target_timeseries_end - target_timeseries_start
    fourth_decimal_num_of_duration = (DURATION * 1000 - floor(DURATION * 1000)) * 10  # fourth decimal number
    DURATION = DURATION - fourth_decimal_num_of_duration * 0.0001  # truncate at the fourth decimal number
    # revise the duration so that an error does not occur in the whitening process:
    # ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different
    third_decimal_num_of_duration = round((round(DURATION, 3) - round(DURATION, 2)) * 1000)
    if third_decimal_num_of_duration % 2 == 1:  # odd number at the third decimal point
        DURATION = DURATION - 0.001  # make the duration with an even number at the third decimal point
    DURATION = round(DURATION, 3)  # remove the small residual below the 3rd decimal point created by the lines above

    has_pre_background = pre_background_start != 0
    has_fol_background = fol_background_start != 0
    if not (has_pre_background or has_fol_background):
        # without any BG there is no reference ASD to whiten with
        raise ValueError('neither a preceding nor a following background is available for whitening')

    full_timeseries_start = full_timeseries.span[0]
    full_timeseries_end = full_timeseries.span[1]

    if has_pre_background and has_fol_background:  # if both BGs are available
        # weights of the two BG ASDs, proportional to the requested BG durations
        pre_background_duration = pre_background_end - pre_background_start
        fol_background_duration = fol_background_end - fol_background_start
        total_background_duration = pre_background_duration + fol_background_duration
        pre_background_asd_weight = pre_background_duration / total_background_duration
        fol_background_asd_weight = fol_background_duration / total_background_duration
        # retrieve the preceding and following backgrounds
        if pre_background_start < full_timeseries_start:  # the preceding BG starts before the full time series
            pre_background = full_timeseries.crop(end=pre_background_end)  # only specify the end time of cropping
        else:
            pre_background = full_timeseries.crop(start=pre_background_start, end=pre_background_end)
        if fol_background_end > full_timeseries_end:  # the following BG ends after the full time series
            fol_background = full_timeseries.crop(start=fol_background_start)  # only specify the start time of cropping
        else:
            fol_background = full_timeseries.crop(start=fol_background_start, end=fol_background_end)
        # weighted average of the two BG ASDs, converted to a numpy array
        pre_background_asd = pre_background.asd(DURATION, method='welch')
        fol_background_asd = fol_background.asd(DURATION, method='welch')
        ave_background_asd = (pre_background_asd_weight * pre_background_asd + fol_background_asd_weight * fol_background_asd).value
        whitened_fft_PBG = _whitened_background_trials(pre_background, DURATION, ave_background_asd)
        whitened_fft_FBG = _whitened_background_trials(fol_background, DURATION, ave_background_asd)
    elif has_pre_background:  # if only the preceding BG is available
        if pre_background_start < full_timeseries_start:
            pre_background_start = full_timeseries_start
        pre_background = full_timeseries.crop(start=pre_background_start, end=pre_background_end)
        # NOTE(review): no method is passed here, so the library default averaging method is used,
        # whereas every other ASD in this function uses method='welch' -- confirm this is intended
        ave_background_asd = pre_background.asd(DURATION).value
        whitened_fft_PBG = _whitened_background_trials(pre_background, DURATION, ave_background_asd)
        whitened_fft_FBG = None  # it is not calculated
    else:  # if only the following BG is available
        if fol_background_end > full_timeseries_end:
            fol_background_end = full_timeseries_end
        fol_background = full_timeseries.crop(start=fol_background_start, end=fol_background_end)
        # NOTE(review): no method is passed here, so the library default averaging method is used,
        # whereas every other ASD in this function uses method='welch' -- confirm this is intended
        ave_background_asd = fol_background.asd(DURATION).value
        whitened_fft_PBG = None  # it is not calculated
        whitened_fft_FBG = _whitened_background_trials(fol_background, DURATION, ave_background_asd)

    # whitening the target using the reference BG ASD
    target_timeseries = full_timeseries.crop(start=target_timeseries_start, end=target_timeseries_end)
    fft_target = target_timeseries.asd(DURATION, method='welch').value
    whitened_fft_target = fft_target / ave_background_asd
    return whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, DURATION
def Multiprocess_whitening_timeseries(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end):
    '''
    description:
        This is used for multi processing for whitening segments and puts out the absolute values of the whitened
        timeseries of the on- and off-source windows instead of the frequency series.
        The deviation of the whitened timeseries is informative, so the absolute values are calculated.
        This function was used to study the metric evaluated in the time domain compared with the metric evaluated
        in the frequency domain. In conclusion, the metric in the frequency domain is better, so this function is
        not used for the main analysis.
        A BG whose start time is 0 is treated as unavailable.
    USAGE: whitened_target_timeseries_abs, whitened_pre_off_source_abs, whitened_fol_off_source_abs, sample_rate, DURATION = Multiprocess_whitening_timeseries(full_timeseries, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end)

    :param full_timeseries: time series comprising target and BGs
    :param target_timeseries_start: a start time of a target segment
    :param target_timeseries_end: an end time of a target segment
    :param pre_background_start: a start time of a preceding BG (0 if it is not available)
    :param pre_background_end: an end time of a preceding BG
    :param fol_background_start: a start time of a following BG (0 if it is not available)
    :param fol_background_end: an end time of a following BG
    :return: whitened_target_timeseries_abs: the absolute values of whitened timeseries in the on-source window
             whitened_pre_off_source_abs: the absolute values of whitened timeseries in the preceding off-source window (None if it is not available)
             whitened_fol_off_source_abs: the absolute values of whitened timeseries in the following off-source window (None if it is not available)
             sample_rate: sampling rate of this channel
             DURATION: a duration of a target segment truncated to an even number of milliseconds
    :raises ValueError: if neither BG is available; a ValueError raised by the ASD calculation is propagated unchanged
    '''
    sample_rate = full_timeseries.sample_rate.value
    DURATION = target_timeseries_end - target_timeseries_start
    fourth_decimal_num_of_duration = (DURATION * 1000 - floor(DURATION * 1000)) * 10  # fourth decimal number
    DURATION = DURATION - fourth_decimal_num_of_duration * 0.0001  # truncate at the fourth decimal number
    # revise the duration so that an error does not occur in the whitening process:
    # ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different
    third_decimal_num_of_duration = round((round(DURATION, 3) - round(DURATION, 2)) * 1000)
    if third_decimal_num_of_duration % 2 == 1:  # odd number at the third decimal point
        DURATION = DURATION - 0.001  # make the duration with an even number at the third decimal point
    DURATION = round(DURATION, 3)  # remove the small residual below the 3rd decimal point created by the lines above

    has_pre_background = pre_background_start != 0
    has_fol_background = fol_background_start != 0
    if not (has_pre_background or has_fol_background):
        # without any BG there is no reference ASD to whiten with
        raise ValueError('neither a preceding nor a following background is available for whitening')

    pre_background_duration = pre_background_end - pre_background_start
    fol_background_duration = fol_background_end - fol_background_start

    # reference ASD: only the available BG side(s) contribute
    # the preceding side spans from the start of the full time series to the end of the preceding BG,
    # the following side spans from the start of the following BG to the end of the full time series
    if has_pre_background:
        pre_background_asd = full_timeseries.crop(end=pre_background_end).asd(DURATION, method='welch')
    if has_fol_background:
        fol_background_asd = full_timeseries.crop(start=fol_background_start).asd(DURATION, method='welch')
    if has_pre_background and has_fol_background:
        # weighted average of the two BG ASDs, with weights proportional to the BG durations
        total_background_duration = pre_background_duration + fol_background_duration
        pre_background_asd_weight = pre_background_duration / total_background_duration
        fol_background_asd_weight = fol_background_duration / total_background_duration
        ave_background_asd = pre_background_asd_weight * pre_background_asd + fol_background_asd_weight * fol_background_asd
    elif has_pre_background:
        ave_background_asd = pre_background_asd
    else:
        ave_background_asd = fol_background_asd

    whitened_full_timeseries = full_timeseries.whiten(fftlength=DURATION, asd=ave_background_asd)

    # an off-source window that is not available is reported as None
    whitened_pre_off_source_abs = None
    whitened_fol_off_source_abs = None
    if has_pre_background:
        # the first 5% of the preceding window is dropped
        # (presumably to stay away from the edge of the whitened data -- TODO confirm)
        whitened_pre_off_source = whitened_full_timeseries.crop(start=pre_background_start + 0.05 * pre_background_duration, end=pre_background_end)
        whitened_pre_off_source_abs = whitened_pre_off_source.abs().value  # take absolute value
    if has_fol_background:
        # the last 5% of the following window is dropped
        # (presumably to stay away from the edge of the whitened data -- TODO confirm)
        whitened_fol_off_source = whitened_full_timeseries.crop(start=fol_background_start, end=fol_background_end - 0.05 * fol_background_duration)
        whitened_fol_off_source_abs = whitened_fol_off_source.abs().value  # take absolute value

    # whitened target, absolute value
    whitened_target_timeseries = whitened_full_timeseries.crop(start=target_timeseries_start, end=target_timeseries_end)
    whitened_target_timeseries_abs = whitened_target_timeseries.abs().value
    return whitened_target_timeseries_abs, whitened_pre_off_source_abs, whitened_fol_off_source_abs, sample_rate, DURATION
#
# def Multiprocess_subtracting(target_timeseries, pre_background, fol_background):
# '''
# description:
# This is used for multi processing for subtracting segments
# :param ch_label: channel label
# :param target_timeseries: time series of a target segment
# :param pre_background: time series of a preceding BG
# :param fol_background: time sereis of a following BG
# :param pre_background_start: start time of a preceding BG
# :param pre_background_end: end time of a preceding BG
# :param fol_background_start: start time of a following BG
# :return: whitened_fft_target: whitened fft of a target segment
# whitened_fft_PBG: whitened fft of a preceding segment
# whitened_fft_FBG: whitened fft of a following segment
# sample_rate: sampling rate of this channel
# DURATION: a duration of a target segment
# '''
# sample_rate = target_timeseries.sample_rate.value
# DURATION = target_timeseries.duration.value
# fourth_decimal_num_of_duration = (DURATION * 1000 - floor(DURATION * 1000)) * 10 # fourth decimal number
# DURATION = DURATION - fourth_decimal_num_of_duration * 0.0001 # truncate at the fourth decimal number
# # revise the duration to make an error in whitening process not not to occur
# # ValueError occurs in the whitening when the lengths of whitened data, whitening data and the filter are different
# third_decimal_num_of_duration = round((round(DURATION, 3) - round(DURATION, 2)) * 1000)
# if third_decimal_num_of_duration % 2 == 1: # odd number at the third decimal point
# DURATION = DURATION - 0.001 # make the duration with an even number at the third decimal point
# elif third_decimal_num_of_duration % 2 == 0: # even number at the third decimal point
# pass # keep the original duration as it is
# DURATION = round(DURATION, 3) # round it up at 3rd decimal point as the above three lines creates small number below the 3rd decimal point
# pre_background_start = pre_background.span[0] # start time of a preceding BG
# pre_background_end = pre_background.span[1] # end time of a preceding BG
# fol_background_start = fol_background.span[0] # start time of a following BG
# fol_background_end = fol_background.span[1] # end time of a following BG
# if pre_background_start != 0 and fol_background_start != 0: # if both BGs are available
# # average background ASD
# try:
# pre_background_fft = pre_background.average_fft(DURATION, window='hanning').abs()# averaged fft of the preceding BG
# fol_background_fft = fol_background.average_fft(DURATION, window='hanning').abs() # averaged fft of the following BG
# except ValueError:
# raise ValueError
# ave_background_fft = (pre_background_fft + fol_background_fft) / 2. # average of those two BG ASD
# # for the preceding BG
# if pre_background.duration.value > DURATION: # if the preceding BG duration is longer than that of the target segment
# res_fft_PBG = np.array([]) # residual
# # number of BG trials
# iterate = int(pre_background.duration.value / DURATION)
# for i in range(iterate): # iterate though the number of trials
# # iterate from the starting time of the preceding BG
# crop_start = pre_background_start + i * DURATION # start time of the cropping
# crop_end = crop_start + DURATION # ending time of of the cropping
# fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop and fft of it
# res_fft_PBG_ind = fft_PBG - ave_background_fft # calcurate a residual of the cropped BG from the averaged BG
# res_fft_PBG = np.append(res_fft_PBG, res_fft_PBG_ind) # append to the whole trials array
# # iterate from the ending time of the preceding BG
# crop_end = pre_background_end - i * DURATION # ending time of of the cropping
# crop_start = crop_end - DURATION # start time of the cropping
# fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop and fft of it
# res_fft_PBG_ind = fft_PBG - ave_background_fft # calcurate a residual of the cropped BG from the averaged BG
# res_fft_PBG = np.append(res_fft_PBG, res_fft_PBG_ind) # append to the whole trials array
# else: # if the preceding BG duration is the same as that of the target segment
# fft_PBG = pre_background.crop(start=pre_background_end - DURATION).average_fft(DURATION, window='hanning').abs()
# # whitening the preceding BG using the average BG
# res_fft_PBG = fft_PBG - ave_background_fft
# # for the following BG
# if fol_background.duration.value > DURATION: # if the following BG duration is longer than that of the target segment
# res_fft_FBG = np.array([])
# # number of BG trials
# iterate = int(fol_background.duration.value / DURATION)
# for i in range(iterate): # iterate though the number of trials
# # iterate from the starting time of the following BG
# crop_start = fol_background_start + i * DURATION # start time of the cropping
# crop_end = crop_start + DURATION # ending time of of the cropping
# fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop and fft of it
# res_fft_FBG_ind = fft_FBG - ave_background_fft # calculate a residual of the cropped BG using the averaged BG
# res_fft_FBG = np.append(res_fft_FBG, res_fft_FBG_ind) # append to the whole trials array
# # iterate from the ending time of the following BG
# crop_end = fol_background_end - i * DURATION # ending time of of the cropping
# crop_start = crop_end - DURATION # start time of the cropping
# fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs().value # crop and fft of it
# res_fft_FBG_ind = fft_FBG - ave_background_fft # calculate a residual of the cropped BG using the averaged BG
# res_fft_FBG = np.append(res_fft_FBG, res_fft_FBG_ind) # append to the whole trials array
# else: # if the following BG duration is the same as that of the target segment
# fft_FBG = fol_background.crop(start=fol_background_start, end=fol_background_start + DURATION).average_fft(DURATION, window='hanning').abs()
# # whitening the following BG using the average BG
# res_fft_FBG = fft_FBG - ave_background_fft
# elif pre_background_start != 0 and fol_background_start == 0: # if there is an available proceeding background
# # fft of a preceding BG
# try:
# pre_background_fft = pre_background.average_fft(DURATION, window='hanning').abs() # fft of the preceding BG
# except ValueError:
# raise ValueError
# ave_background_fft = pre_background_fft # average of those two BG ASD
# # for the preceding BG
# if pre_background.duration.value > DURATION: # if the preceding BG duration is longer than that of the target segment
# res_fft_PBG = np.array([])
# # number of BG trials
# iterate = int(pre_background.duration.value / DURATION)
# for i in range(iterate): # iterate though the number of trials
# # iterate from the starting time of the preceding BG
# crop_start = pre_background_start + i * DURATION # start time of the cropping
# crop_end = crop_start + DURATION # ending time of of the cropping
# fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop and fft of it
# res_fft_PBG_ind = fft_PBG - ave_background_fft # calculate a residual of the cropped BG from the averaged BG
# res_fft_PBG = np.append(res_fft_PBG, res_fft_PBG_ind) # append to the whole trials array
# # iterate from the ending time of the preceding BG
# crop_end = pre_background_end - i * DURATION # ending time of of the cropping
# crop_start = crop_end - DURATION # start time of the cropping
# fft_PBG = pre_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop it
# res_fft_PBG_ind = fft_PBG - ave_background_fft # calculate a residual of the cropped BG using the averaged BG
# res_fft_PBG = np.append(res_fft_PBG, res_fft_PBG_ind) # append to the whole trials array
# else: # if the preceding BG duration is the same as that of the target segment
# fft_PBG = pre_background.crop(start=pre_background_end - DURATION).average_fft(DURATION, window='hanning').abs()
# # whitening the preceding BG using the average BG
# res_fft_PBG = fft_PBG - ave_background_fft
# whitened_fft_FBG = None # it is not calculated
# elif pre_background_start == 0 and fol_background_start != 0: # if there is an available proceeding background
# try:
# fol_background_fft = fol_background.average_fft(DURATION, window='hanning').abs() # fft of the following BG
# except ValueError:
# raise ValueError
# ave_background_fft = fol_background_fft # average of those two BG ASD
# whitened_fft_PBG = None # it is not calculated
# if fol_background.duration.value > DURATION: # if the following BG duration is longer than that of the target segment
# res_fft_FBG = np.array([])
# # number of BG trials
# iterate = int(fol_background.duration.value / DURATION)
# for i in range(iterate): # iterate though the number of trials
# # iterate from the starting time of the following BG
# crop_start = fol_background_start + i * DURATION # start time of the cropping
# crop_end = crop_start + DURATION # ending time of of the cropping
# fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop it
# res_fft_FBG_ind = fft_FBG - ave_background_fft # calculate a residual of the cropped BG from the averaged BG
# res_fft_FBG = np.append(res_fft_FBG, res_fft_FBG_ind) # append to the whole trials array
# # iterate from the ending time of the following BG
# crop_end = fol_background_end - i * DURATION # ending time of of the cropping
# crop_start = crop_end - DURATION # start time of the cropping
# fft_FBG = fol_background.crop(start=crop_start, end=crop_end).average_fft(DURATION, window='hanning').abs() # crop it
# res_fft_FBG_ind = fft_FBG - ave_background_fft # calculate a residual of the cropped BG from the averaged BG
# whitened_fft_FBG = np.append(whitened_fft_FBG, res_fft_FBG_ind) # append to the whole trials array
# else: # if the following BG duration is the same as that of the target segment
# fft_FBG = fol_background.crop(start=fol_background_start, end=fol_background_start + DURATION).average_fft(DURATION, window='hanning').abs()
# # whitening the following BG using the average BG
# res_fft_FBG = fft_FBG - ave_background_fft
# # whitening Target
# fft_target = target_timeseries.average_fft(DURATION, window='hanning').abs() # fft of the target
# # whitening the target using the average BG
# res_fft_target = fft_target - ave_background_fft
# return res_fft_target, res_fft_PBG, res_fft_FBG, sample_rate, DURATION
def SaveTargetAndBackGroundHDF5_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False'):
    '''
    description:
        THIS IS USED FOR "OFFLINE" MODE
        0. assuming Listsegments is given by Findglitchlist()
        1. for every segment, query the time series of the safe channels over the target and the available preceding and/or following background
        2. whiten the target and the backgrounds with Multiprocess_whitening()
        3. save the whitened target and background FFTs in a group 'gps{IndexSatisfied:05d}' of a new HDF5 file
        A segment is skipped when its data can not be queried, and its group is deleted when the whitening or the writing fails
    Note this depends on Multiprocess_whitening()
    USAGE: SaveTargetAndBackGroundHDF5_OFFLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False')

    :param Listsegments: an iterable of 12-element records
        (IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end,
        fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID),
        a background whose start and end are both 0 is treated as not available
    :param re_sfchs: a list of safe channel names without the IFO prefix
    :param IFO: a prefix of an interferometer, used for the channel names and the frame types
    :param outputpath: a directory of an output file, it has to exist
    :param outputfilename: a name of an output file, it must not exist yet
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default
    :return: None, the process exits via sys.exit() if the directory is missing or the file can not be created
    '''
    def _store_dataset(group, name, data, sample_rate, channel_name, seg_duration):
        # write one gzip-compressed dataset and tag it with its metadata
        dset = group.create_dataset(name, data=data, compression="gzip", compression_opts=9)
        dset.attrs['SamplingRate'] = sample_rate
        dset.attrs['channel'] = channel_name
        dset.attrs['duration'] = seg_duration

    # check if the output directory exists
    if not os.path.isdir(outputpath):
        print("The directory for the output HDF5 file does not exist: path = {}".format(outputpath))
        print("Please specify the existing directory for it.")
        sys.exit()
    # the list of the safe channels
    channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
    hoft_channel = '{}:GDS-CALIB_STRAIN'.format(IFO)
    # the path and the file name
    outputPathFileName = os.path.join(outputpath, outputfilename)
    # make an output file, the 'w-' mode fails if the file already exists so that nothing is overwritten
    try:
        f = h5py.File(outputPathFileName, 'w-')
        f.close()
    except IOError:
        print('The file already exists in {}'.format(outputPathFileName))
        print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it')
        sys.exit()
    # open a HDF5 file in writing mode
    with h5py.File(outputPathFileName, 'r+') as f:
        for IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID in Listsegments:
            t0 = time.time()
            # availability of the preceding (pre) and following (fol) backgrounds
            has_pre = pre_background_start != 0 and pre_background_end != 0
            no_pre = pre_background_start == 0 and pre_background_end == 0
            has_fol = fol_background_start != 0 and fol_background_end != 0
            no_fol = fol_background_start == 0 and fol_background_end == 0
            # choose the time window to be queried
            if has_pre and has_fol:  # both backgrounds are available
                query_start, query_end = pre_background_start, fol_background_end
            elif has_pre and no_fol:  # only the preceding background is available
                query_start, query_end = pre_background_start, target_timeseries_end
            elif no_pre and has_fol:  # only the following background is available
                query_start, query_end = target_timeseries_start, fol_background_end
            else:
                # nothing can be queried for this record; going on would reuse the data of the previous
                # segment (or fail with a NameError on the first one), so skip it
                print('No available background segment, go to a next GPS')
                continue
            try:  # deal with time series in bad state
                full_timeseriesDict = TimeSeriesDict.find(channels, query_start, query_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                channels_used = channels
                if PlusHOFT == 'True':  # get the data of hoft
                    full_timeseriesDict_hoft = TimeSeriesDict.find([hoft_channel], query_start, query_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                    full_timeseriesDict.append(full_timeseriesDict_hoft)  # append
                    del full_timeseriesDict_hoft  # delete the hoft object
                    channels_used = np.append(channels, hoft_channel)
            except IndexError:
                print('Bad data, go to a next GPS')
                continue  # go to a next GPS time
            except RuntimeError:
                print('RuntimeError, go to a next GPS')
                continue
            except IOError:  # error when getting time series
                print('IOError, go to a next GPS')
                continue
            except ValueError:  # e.g. a frame file that fails to be read
                print('ValueError, go to a next GPS')
                continue
            group_name = 'gps{:05d}'.format(IndexSatisfied)
            g = f.create_group(group_name)
            g.attrs['GPS'] = gpstime  # a peak GPS time
            g.attrs['duration'] = duration
            g.attrs['snr'] = SNR  # snr
            g.attrs['confidence'] = confi  # confidence
            g.attrs['id'] = ID
            g.attrs['ifo'] = channels[0].split(':')[0]
            try:
                List_whitened_fft_target = []  # a list of whitened fft of target
                List_whitened_fft_PBG = []  # a list of whitened fft of PBG
                List_whitened_fft_FBG = []  # a list of whitened fft of FBG
                List_sample_rate = []  # a list of sample rates
                List_DURATION = []  # a list of durations
                # whitened target and BG segments with multi processes
                with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                    for whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, DURATION in executor.map(
                            Multiprocess_whitening,
                            full_timeseriesDict.values(),
                            itrtls.repeat(target_timeseries_start),
                            itrtls.repeat(target_timeseries_end),
                            itrtls.repeat(pre_background_start),
                            itrtls.repeat(pre_background_end),
                            itrtls.repeat(fol_background_start),
                            itrtls.repeat(fol_background_end)):
                        List_whitened_fft_target.append(whitened_fft_target)  # fill
                        List_whitened_fft_PBG.append(whitened_fft_PBG)  # fill
                        List_whitened_fft_FBG.append(whitened_fft_FBG)  # fill
                        List_sample_rate.append(sample_rate)  # fill
                        List_DURATION.append(DURATION)  # fill
                # NOTE(review): assumes full_timeseriesDict.values() iterates in the order of channels_used -- confirm
                for ch_label in range(len(channels_used)):  # iterate through channel labels
                    channel_name = channels_used[ch_label].split(':')[1]
                    _store_dataset(g, "chT{:05d}".format(ch_label), List_whitened_fft_target[ch_label], List_sample_rate[ch_label], channel_name, List_DURATION[ch_label])  # target
                    # identity checks are required here: "!= None" on an array is evaluated element-wise and its
                    # truth value raises a ValueError, which would delete the group below
                    if List_whitened_fft_PBG[ch_label] is not None:  # the preceding BG is available ("Either" BG selection)
                        _store_dataset(g, "chP{:05d}".format(ch_label), List_whitened_fft_PBG[ch_label], List_sample_rate[ch_label], channel_name, List_DURATION[ch_label])  # BG
                    if List_whitened_fft_FBG[ch_label] is not None:  # the following BG is available ("Either" BG selection)
                        _store_dataset(g, "chF{:05d}".format(ch_label), List_whitened_fft_FBG[ch_label], List_sample_rate[ch_label], channel_name, List_DURATION[ch_label])  # BG
            except ValueError:
                del f[group_name]
                print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
                continue  # go to a next GPS time
            except IOError:  # IOError: Driver write request failed
                del f[group_name]
                print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied))
                continue  # go to a next GPS time
            print('Complete saving a sample as gps{:05d}'.format(IndexSatisfied))
            t1 = time.time()
            print('Time taken: {} s'.format(t1 - t0))
def SaveTargetAndBackGroundHDF5_ONLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT):
    '''
    description:
        THIS IS USED FOR "ONLINE" MODE
        This is a generator: nothing is checked, created or written until it is iterated
        0. assuming Listsegments is given by Findglitchlist()
        1. for every segment, query the time series of the safe channels over the target and the available preceding and/or following background
        2. whiten the target and the backgrounds with Multiprocess_whitening()
        3. save the whitened target and background FFTs in a group 'gps{IndexSatisfied:05d}' of a new HDF5 file
        4. yield the group that has just been saved while the HDF5 file is still open
        A segment is skipped when its data can not be queried, and its group is deleted when the whitening or the writing fails
    Note this depends on Multiprocess_whitening()
    USAGE: for g in SaveTargetAndBackGroundHDF5_ONLINE(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT): ...

    :param Listsegments: an iterable of 12-element records
        (IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end,
        fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID),
        a background whose start and end are both 0 is treated as not available
    :param re_sfchs: a list of safe channel names without the IFO prefix
    :param IFO: a prefix of an interferometer, used for the channel names and the frame types
    :param outputpath: a directory of an output file, it has to exist
    :param outputfilename: a name of an output file, it must not exist yet
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}
    :return: a generator of HDF5 group objects, one per saved segment
    '''
    def _store_dataset(group, name, data, sample_rate, channel_name, seg_duration):
        # write one gzip-compressed dataset and tag it with its metadata
        dset = group.create_dataset(name, data=data, compression="gzip", compression_opts=9)
        dset.attrs['SamplingRate'] = sample_rate
        dset.attrs['channel'] = channel_name
        dset.attrs['duration'] = seg_duration

    # check if the output directory exists, otherwise the file creation below fails and
    # would misleadingly be reported as "The file already exists"
    if not os.path.isdir(outputpath):
        print("The directory for the output HDF5 file does not exist: path = {}".format(outputpath))
        print("Please specify the existing directory for it.")
        sys.exit()
    # the list of the safe channels
    channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
    hoft_channel = '{}:GDS-CALIB_STRAIN'.format(IFO)
    # the path and the file name
    outputPathFileName = os.path.join(outputpath, outputfilename)
    # make an output file, the 'w-' mode fails if the file already exists so that nothing is overwritten
    try:
        f = h5py.File(outputPathFileName, 'w-')
        f.close()
    except IOError:
        print('The file already exists in {}'.format(outputPathFileName))
        print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it')
        sys.exit()
    with h5py.File(outputPathFileName, 'r+') as f:
        for IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID in Listsegments:
            t0 = time.time()
            # availability of the preceding (pre) and following (fol) backgrounds
            has_pre = pre_background_start != 0 and pre_background_end != 0
            no_pre = pre_background_start == 0 and pre_background_end == 0
            has_fol = fol_background_start != 0 and fol_background_end != 0
            no_fol = fol_background_start == 0 and fol_background_end == 0
            # choose the time window to be queried
            if has_pre and has_fol:  # both backgrounds are available
                query_start, query_end = pre_background_start, fol_background_end
            elif has_pre and no_fol:  # only the preceding background is available
                query_start, query_end = pre_background_start, target_timeseries_end
            elif no_pre and has_fol:  # only the following background is available
                query_start, query_end = target_timeseries_start, fol_background_end
            else:
                # nothing can be queried for this record; going on would reuse the data of the previous
                # segment (or fail with a NameError on the first one), so skip it
                print('No available background segment, go to a next GPS')
                continue
            # extracting the target data and the proceeding and following backgrounds
            try:  # deal with time series in bad state
                full_timeseriesDict = TimeSeriesDict.find(channels, query_start, query_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                channels_used = channels
                if PlusHOFT == 'True':  # get the data of hoft
                    full_timeseriesDict_hoft = TimeSeriesDict.find([hoft_channel], query_start, query_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                    full_timeseriesDict.append(full_timeseriesDict_hoft)  # append
                    del full_timeseriesDict_hoft  # delete the hoft object
                    channels_used = np.append(channels, hoft_channel)
            except IndexError:  # e.g. a frame stream that is no longer in good state
                continue  # go to a next GPS time
            except RuntimeError:
                continue  # go to a next GPS time
            except IOError:  # error when getting time series
                print('IOError, go to a next GPS')
                continue
            except ValueError:  # e.g. a frame file that fails to be read
                print('ValueError, go to a next GPS')
                continue
            group_name = 'gps{:05d}'.format(IndexSatisfied)
            g = f.create_group(group_name)
            g.attrs['GPS'] = gpstime  # a peak GPS time
            g.attrs['duration'] = duration
            g.attrs['snr'] = SNR  # snr
            g.attrs['confidence'] = confi  # confidence
            g.attrs['id'] = ID
            g.attrs['ifo'] = channels[0].split(':')[0]
            try:
                List_whitened_fft_target = []  # a list of whitened fft of target
                List_whitened_fft_PBG = []  # a list of whitened fft of PBG
                List_whitened_fft_FBG = []  # a list of whitened fft of FBG
                List_sample_rate = []  # a list of sample rates
                List_DURATION = []  # a list of durations
                # whitened target and BG segments with multi processes
                # the results are unpacked as (target, PBG, FBG, ...) like in the OFFLINE and TimeShift variants;
                # this function used to swap PBG and FBG, which stored each background under the other's name
                # NOTE(review): presumes Multiprocess_whitening() returns the preceding BG before the following BG -- confirm
                with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                    for whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, DURATION in executor.map(
                            Multiprocess_whitening,
                            full_timeseriesDict.values(),
                            itrtls.repeat(target_timeseries_start),
                            itrtls.repeat(target_timeseries_end),
                            itrtls.repeat(pre_background_start),
                            itrtls.repeat(pre_background_end),
                            itrtls.repeat(fol_background_start),
                            itrtls.repeat(fol_background_end)):
                        List_whitened_fft_target.append(whitened_fft_target)  # fill
                        List_whitened_fft_PBG.append(whitened_fft_PBG)  # fill
                        List_whitened_fft_FBG.append(whitened_fft_FBG)  # fill
                        List_sample_rate.append(sample_rate)  # fill
                        List_DURATION.append(DURATION)  # fill
                # NOTE(review): assumes full_timeseriesDict.values() iterates in the order of channels_used -- confirm
                for ch_label in range(len(channels_used)):  # iterate through channel labels
                    channel_name = channels_used[ch_label].split(':')[1]
                    _store_dataset(g, "chT{:05d}".format(ch_label), List_whitened_fft_target[ch_label], List_sample_rate[ch_label], channel_name, List_DURATION[ch_label])  # target
                    # identity checks are required here: "!= None" on an array is evaluated element-wise and its
                    # truth value raises a ValueError, which would delete the group below
                    if List_whitened_fft_PBG[ch_label] is not None:  # the preceding BG is available ("Either" BG selection)
                        _store_dataset(g, "chP{:05d}".format(ch_label), List_whitened_fft_PBG[ch_label], List_sample_rate[ch_label], channel_name, List_DURATION[ch_label])  # BG
                    if List_whitened_fft_FBG[ch_label] is not None:  # the following BG is available ("Either" BG selection)
                        _store_dataset(g, "chF{:05d}".format(ch_label), List_whitened_fft_FBG[ch_label], List_sample_rate[ch_label], channel_name, List_DURATION[ch_label])  # BG
            except ValueError:
                del f[group_name]
                print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
                continue
            except IOError:  # IOError: Driver write request failed
                del f[group_name]
                print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied))
                continue  # go to a next GPS time
            yield g  # pass a group object as a generator
            t1 = time.time()
            print('Time taken: {} s'.format(t1 - t0))
def SaveTargetAndBackGroundHDF5_TimeShift(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False'):
    '''
    description:
        THIS IS USED FOR THE TIME SHIFT ANALYSIS
        0. assuming Listsegments is given by Findglitchlist()
        1. for the first segment (IndexSatisfied == 0), query the time series of the safe channels over the target and the available preceding and/or following background
        2. for the other segments, query only the target window and prepend the data kept from the previous segment
        3. whiten the target and the backgrounds with Multiprocess_whitening()
        4. save the whitened target and background FFTs in a group 'gps{IndexSatisfied:05d}' of a new HDF5 file
        A segment is skipped when its data can not be queried, and its group is deleted when the whitening or the writing fails
    Note this depends on Multiprocess_whitening()
    USAGE: SaveTargetAndBackGroundHDF5_TimeShift(Listsegments, re_sfchs, IFO, outputpath, outputfilename, number_process, PlusHOFT='False')

    :param Listsegments: an iterable of 12-element records
        (IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end,
        fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID),
        a background whose start and end are both 0 is treated as not available
    :param re_sfchs: a list of safe channel names without the IFO prefix
    :param IFO: a prefix of an interferometer, used for the channel names and the frame types
    :param outputpath: a directory of an output file, it has to exist
    :param outputfilename: a name of an output file, it must not exist yet
    :param number_process: a number of processes in parallel
    :param PlusHOFT: whether to get data of hoft, {'True' or 'False'}, 'False' in default
    :return: None, the process exits via sys.exit() if the directory is missing or the file can not be created
    '''
    def _store_dataset(group, name, data, sample_rate, channel_name, seg_duration):
        # write one gzip-compressed dataset and tag it with its metadata
        dset = group.create_dataset(name, data=data, compression="gzip", compression_opts=9)
        dset.attrs['SamplingRate'] = sample_rate
        dset.attrs['channel'] = channel_name
        dset.attrs['duration'] = seg_duration

    # check if the output directory exists, otherwise the file creation below fails and
    # would misleadingly be reported as "The file already exists"
    if not os.path.isdir(outputpath):
        print("The directory for the output HDF5 file does not exist: path = {}".format(outputpath))
        print("Please specify the existing directory for it.")
        sys.exit()
    # the list of the safe channels, it is only used for the raw-frame query
    channels = ['{0}:{1}'.format(IFO, ch) for ch in re_sfchs]
    hoft_channel = '{}:GDS-CALIB_STRAIN'.format(IFO)
    # the channels that are stored; built once, because appending the hoft channel to "channels" in every
    # iteration made the list grow and sent the hoft channel to the raw-frame query
    if PlusHOFT == 'True':
        channels_used = np.append(channels, hoft_channel)
    else:
        channels_used = channels
    # the path and the file name
    outputPathFileName = os.path.join(outputpath, outputfilename)
    # make an output file, the 'w-' mode fails if the file already exists so that nothing is overwritten
    try:
        f = h5py.File(outputPathFileName, 'w-')
        f.close()
    except IOError:
        print('The file already exists in {}'.format(outputPathFileName))
        print('Please choose a different file name in a configuration file or delete this file if you want to overwrite it')
        sys.exit()
    # data kept from the last successfully queried segment
    previous_full_timeseriesDict = None
    previous_pre_background_start = None
    # open a HDF5 file in writing mode
    with h5py.File(outputPathFileName, 'r+') as f:
        for IndexSatisfied, target_timeseries_start, target_timeseries_end, pre_background_start, pre_background_end, fol_background_start, fol_background_end, gpstime, duration, SNR, confi, ID in Listsegments:
            t0 = time.time()
            # availability of the preceding (pre) and following (fol) backgrounds
            has_pre = pre_background_start != 0 and pre_background_end != 0
            no_pre = pre_background_start == 0 and pre_background_end == 0
            has_fol = fol_background_start != 0 and fol_background_end != 0
            no_fol = fol_background_start == 0 and fol_background_end == 0
            # without data kept from a previous segment (e.g. the first query failed) the whole window has to
            # be queried like for the first segment, there is nothing to prepend
            first_query = IndexSatisfied == 0 or previous_full_timeseriesDict is None
            # choose the time window to be queried
            if has_pre and has_fol:  # both backgrounds are available
                query_start, query_end = pre_background_start, fol_background_end
            elif has_pre and no_fol:  # only the preceding background is available
                query_start, query_end = pre_background_start, target_timeseries_end
            elif no_pre and has_fol:  # only the following background is available
                query_start, query_end = target_timeseries_start, fol_background_end
            else:
                # nothing can be queried for this record; going on would reuse the data of the previous
                # segment (or fail with a NameError on the first one), so skip it
                print('No available background segment, go to a next GPS')
                continue
            if not first_query:
                # only the target is queried, the rest comes from the previous data
                query_start, query_end = target_timeseries_start, target_timeseries_end
            try:  # deal with time series in bad state
                full_timeseriesDict = TimeSeriesDict.find(channels, query_start, query_end, frametype='{}_R'.format(IFO), verbose=False, nproc=number_process)
                if PlusHOFT == 'True':  # get the data of hoft
                    full_timeseriesDict_hoft = TimeSeriesDict.find([hoft_channel], query_start, query_end, frametype='{}_HOFT_C00'.format(IFO), verbose=False, nproc=number_process)
                    full_timeseriesDict.append(full_timeseriesDict_hoft)  # append
                    del full_timeseriesDict_hoft  # delete the hoft object
                if not first_query:
                    # NOTE(review): the previous data are cropped at the previous pre-background start, not at the
                    # current one -- kept as it was, confirm that this is the intended window
                    full_timeseriesDict.prepend(previous_full_timeseriesDict.crop(start=previous_pre_background_start))
                previous_full_timeseriesDict = full_timeseriesDict
                previous_pre_background_start = pre_background_start
            # =====================================================
            except IndexError:
                print('Bad data, go to a next GPS')
                continue  # go to a next GPS time
            except RuntimeError:
                print('RuntimeError, go to a next GPS')
                continue
            except IOError:  # error when getting time series
                print('IOError, go to a next GPS')
                continue  # was missing: the segment went on with undefined or stale data
            except ValueError:  # e.g. a frame file that fails to be read
                print('ValueError, go to a next GPS')
                continue  # was missing: the segment went on with undefined or stale data
            t2 = time.time()
            print('Time taken for query: {} s'.format(t2 - t0))
            group_name = 'gps{:05d}'.format(IndexSatisfied)
            g = f.create_group(group_name)
            g.attrs['GPS'] = gpstime  # a peak GPS time
            g.attrs['duration'] = duration
            g.attrs['snr'] = SNR  # snr
            g.attrs['confidence'] = confi  # confidence
            g.attrs['id'] = ID
            g.attrs['ifo'] = channels[0].split(':')[0]
            try:
                List_whitened_fft_target = []  # a list of whitened fft of target
                List_whitened_fft_PBG = []  # a list of whitened fft of PBG
                List_whitened_fft_FBG = []  # a list of whitened fft of FBG
                List_sample_rate = []  # a list of sample rates
                List_DURATION = []  # a list of durations
                # whitened target and BG segments with multi processes
                with concurrent.futures.ProcessPoolExecutor(max_workers=number_process) as executor:
                    for whitened_fft_target, whitened_fft_PBG, whitened_fft_FBG, sample_rate, DURATION in executor.map(
                            Multiprocess_whitening,
                            full_timeseriesDict.values(),
                            itrtls.repeat(target_timeseries_start),
                            itrtls.repeat(target_timeseries_end),
                            itrtls.repeat(pre_background_start),
                            itrtls.repeat(pre_background_end),
                            itrtls.repeat(fol_background_start),
                            itrtls.repeat(fol_background_end)):
                        List_whitened_fft_target.append(whitened_fft_target)  # fill
                        List_whitened_fft_PBG.append(whitened_fft_PBG)  # fill
                        List_whitened_fft_FBG.append(whitened_fft_FBG)  # fill
                        List_sample_rate.append(sample_rate)  # fill
                        List_DURATION.append(DURATION)  # fill
                # NOTE(review): assumes full_timeseriesDict.values() iterates in the order of channels_used -- confirm
                for ch_label in range(len(channels_used)):  # iterate through channel labels
                    channel_name = channels_used[ch_label].split(':')[1]
                    _store_dataset(g, "chT{:05d}".format(ch_label), List_whitened_fft_target[ch_label], List_sample_rate[ch_label], channel_name, List_DURATION[ch_label])  # target
                    # identity checks are required here: "!= None" on an array is evaluated element-wise and its
                    # truth value raises a ValueError, which would delete the group below
                    if List_whitened_fft_PBG[ch_label] is not None:  # the preceding BG is available ("Either" BG selection)
                        _store_dataset(g, "chP{:05d}".format(ch_label), List_whitened_fft_PBG[ch_label], List_sample_rate[ch_label], channel_name, List_DURATION[ch_label])  # BG
                    if List_whitened_fft_FBG[ch_label] is not None:  # the following BG is available ("Either" BG selection)
                        _store_dataset(g, "chF{:05d}".format(ch_label), List_whitened_fft_FBG[ch_label], List_sample_rate[ch_label], channel_name, List_DURATION[ch_label])  # BG
            except ValueError:
                del f[group_name]
                print('*** delete group gps{:05d} due to ASD estimation error ***'.format(IndexSatisfied))
                continue  # go to a next GPS time
            except IOError:  # IOError: Driver write request failed
                del f[group_name]
                print('*** delete group gps{:05d} due to memory error ***'.format(IndexSatisfied))
                continue  # go to a next GPS time
            print('Complete saving a sample as gps{:05d}'.format(IndexSatisfied))
            t1 = time.time()
            print('Time taken: {} s'.format(t1 - t0))
#+++++++++++++++++++++++++++++++++++++++++++++++
# plot tools
#+++++++++++++++++++++++++++++++++++++++++++++++
[docs]class PlotTableAnalysis():
def __init__(self):
self.f = None # a HDF5 file storing a glitch class
[docs] def getHDF5Object(self):
'''
show a HDF5 file object
USAGE: getHDF5Object()
:return: a dictionary
'''
if self.f == None:
print('A HDF5 file object is not set yet')
else:
return self.f
def setHDF5Object(self, input_dir, input_file):
    '''
    description:
        open a HDF5 file in the read mode and keep its handle in self.f
    USAGE: setHDF5Object(input_dir, input_file)
    :param input_dir: a directory where the HDF5 file is located
    :param input_file: a name of the HDF5 file
    :return: None
    '''
    self.f = h5py.File(os.path.join(input_dir, input_file), 'r') # read-only handle
[docs] def FindNumberSample(self):
'''
description:
find the number of samples
:return: the number of samples
'''
number_sample = len(list(self.f.keys()))
return number_sample
def FindNumberChannels(self, g):
    '''
    description:
        count the channels that are analyzed, i.e., the target data sets of a group
        the kind of a data set is read from the third character of its name ('T' in names such as 'chT00000')
    USAGE: NumberOfChannels = FindNumberChannels(g)
    :param g: a HDF file group object
    :return: NumberOfChannels: a number of channels that are analyzed
    '''
    kind_position = 2 # position of the kind letter in a data set name
    return sum(1 for dset_name in g.keys() if str(dset_name)[kind_position] == 'T')
[docs] def HierarchyChannelAboveThreshold(self, g, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None'):
'''
description:
calculate values of importance which is the number of frequency bins above a the off-source window for channels for a given time of a glitch
USAGE: RankingChannelAndCount, ListCount, ListChannelName, GPS, ID, SNR, confidence, duration = HierarchyChannelAboveThreshold(g, LowerCutOffFreq, UpperCutOffFreq, sigma=10)
:param g: (hdf5 format) a group having a glitch
:param pt: a value of area integrating a distribution upto
:param sigma: value of standard deviation of ratio of medians to determine important channels
:return:
RankingChannelAndRatioMed: # a list of channel names with their ratios descended by ratios
Importantchannels: # channels with their ratio is greater than the threshold of ratio, along with their ratios
'''
SNR = g.attrs['snr']
GPS = g.attrs['GPS']
confidence = g.attrs['confidence']
ID = g.attrs['id']
DURATION = g.attrs['duration']
ListChannelName = np.array([])
ListCount = np.array([])
NumberOfChannels = self.FindNumberChannels(g)
for i in range(NumberOfChannels):
dsetT = g['chT{:05d}'.format(i)] # a target data set
try: # this is required in the condition for "Either" BG selection
dsetP = g['chP{:05d}'.format(i)] # a preceding BG data set
except KeyError:
dsetP = None
try: # this is required in the condition for "Either" BG selection
dsetF = g['chF{:05d}'.format(i)] # a following BG data set
except KeyError:
dsetF = None
ChannelName = dsetT.attrs['channel'].decode('utf-8') # channel name
duration = dsetT.attrs['duration'] # duration
sampling_rate = dsetT.attrs['SamplingRate'] # sampling rate of this channel
spT = dsetT[()] # target data set pixel value
if dsetP != None: # this is required in the condition for "Either" BG selection
spP = dsetP[()] # a preceding BG data set pixel value
else:
spP = np.array([[0]]) # a trick to be used in the line, if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and np.isnan(spP[:,0]).any() != True:
if dsetF != None: # this is required in the condition for "Either" BG selection
spF = dsetF[()] # a following BG data set pixel value
else:
spF = np.array([[0]]) # a trick to be used in the line, if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and np.isnan(spP[:,0]).any() != True:
#=======================
if dsetP != None:
Number_PBG_trials = spP.shape[0] / spT.shape[0]
spP = spP.reshape(int(Number_PBG_trials), spT.shape[0])
if dsetF != None:
Number_FBG_trials = spF.shape[0] / spT.shape[0]
spF = spF.reshape(int(Number_FBG_trials), spT.shape[0])
#=======================
#++++++ for the case where there is no data measured at a channel +++++++++++++++++++
#++++++ That channel has no contribution to h(t) so that we can define Count = 0 +++++++++++
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
if np.isnan(spT[0]) != True and np.isnan(spP[:,0]).any() != True and np.isnan(spP[:,0]).any() != True: # if target and BG have data measured
pT = spT #** 2 # power of target
if dsetP != None:
pP = spP #** 2 # power of a preceding BG
if dsetF != None:
pF = spF #** 2 # power of a following BG
# low, high, or band pass filter if we know which frequency band is important
if LowerCutOffFreq != 'None' and UpperCutOffFreq == 'None': # high-pass filter
## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch)
if sampling_rate / 2 <= LowerCutOffFreq:
pT = np.array([0])
pP = np.array([0])
pF = np.array([0])
else:
LowerCutOffElement = int(LowerCutOffFreq*duration)
if LowerCutOffElement <= pT.size: # if LowCutOffElement is within the length of the vector
pT = pT[LowerCutOffElement:]
if dsetP != None:
pP = pP[:, LowerCutOffElement:]
if dsetF != None:
pF = pF[:, LowerCutOffElement:]
else: # if LowerCutOffElement is greater than the length of the vector
pT = np.array([0])
if dsetP != None:
pP = np.array([0])
if dsetF != None:
pF = np.array([0])
elif LowerCutOffFreq == 'None' and UpperCutOffFreq != 'None': # low-pass filter
# even if UpperCutOffElement is greater than the length of the vector is works fine
UpperCutOffElement = int(UpperCutOffFreq * duration)
pT = pT[:UpperCutOffElement]
if dsetP != None:
pP = pP[:, :UpperCutOffElement]
if dsetF != None:
pF = pF[:, :UpperCutOffElement]
elif LowerCutOffFreq != 'None' and UpperCutOffFreq != 'None': # band pass filter
## pT, pP, and pF should be zero if LowerCutOffFreq is greater than the half of the sampling rate as there is no data. However pT would be non-zero due to the bad resolution (a very short giltch)
if sampling_rate / 2 <= LowerCutOffFreq:
pT = np.array([0])
pP = np.array([0])
pF = np.array([0])
else:
LowerCutOffElement = int(LowerCutOffFreq * duration)
UpperCutOffElement = int(UpperCutOffFreq * duration)
if LowerCutOffElement <= pT.size: # if LowCutOffElement is within the length of the vector
pT = pT[LowerCutOffElement:UpperCutOffElement]
if dsetP != None:
pP = pP[:, LowerCutOffElement:UpperCutOffElement]
if dsetF != None:
pF = pF[:, LowerCutOffElement:UpperCutOffElement]
else: # if LowCutOffElement is greater than the length of the vector
pT = np.array([0])
if dsetP != None:
pP = np.array([0])
if dsetF != None:
pF = np.array([0])
else:
pass
if dsetP != None and dsetF != None:
pB = np.concatenate((pP, pF), axis=0)
elif dsetP != None and dsetF == None:
pB = pP
elif dsetP == None and dsetF != None:
pB = pF
GausCeiling = pB.mean() + sigma * pB.std(ddof=1) # gaussian noise ceiling
########a metric with amplitude ratio ########
#try:
# GausCeiling = pB.sum(axis=1).mean() + pB.sum(axis=1).std(ddof=1)
#except ValueError: # if pB is [0 0]
# GausCeiling = 0
##################################################
#pt = 0.95
#histB, edgB = np.histogram(pB, bins=100, density=True) # histgram of the background
#deltaB = edgB[1] - edgB[0] # the length of the bin
#cumnumB = np.cumsum(histB * deltaB) # cumulative number
#UpperBoundB = edgB[np.where(cumnumB > pt)[0][0]:np.where(cumnumB > pt)[0][0] + 2].mean() # median value
# Note that even if pT, pB do not have elements in them, UpperBoundB becomes = 0.0050 so that there is no element in pT above the value as
# I set pT = np.array([0]) in this case. Eventually, Count = 0, which is ok
#Count = pT[pT > UpperBoundB].shape[0] / float(pT.shape[0]) # a fraction of components above the ceiling
############################
try:
Count = pT[pT > GausCeiling].shape[0] / float(pT.shape[0]) # a fraction of components above the ceiling
########### a metric with amplitude ratio ###############
#Count = pT.sum() / GausCeiling
#if Count < 0:
# Count = 0
except ZeroDivisionError: # in the situation where the number of the total elments is zero, I saw GausCeiling is nan. So this channel has mulfunction. I manually set Count = 0
Count = 0
# if the data is flat over the stretch of the segment, the upper bound threshold of BG called "GausCeiling" becomes zero (BG has to flactuate so that "GausCeiling" equal to zero is fake)
# if so that importance called "Count" becomes 1. To prevent this fake importance, I set Count = 0 manually
if GausCeiling == 0.:
Count = 0
else:
Count = 0 # manually put zero because this channel has no contribution at this glitch ID
ListCount = np.append(ListCount, Count)
ListChannelName = np.append(ListChannelName, ChannelName)
#+++++++++ find a hierarchy of channels +++++++++++++++++++++
RankingChannelAndCount = [[ListChannelName[i], ListCount[i]] for i in np.argsort(ListCount)[::-1]] # a list of channel names with maximum value of SNR series
return RankingChannelAndCount, ListCount, ListChannelName, GPS, ID, SNR, confidence, DURATION
[docs] def CreateMatCount(self, sigma, g_Individual = None, LowerCutOffFreq='None', UpperCutOffFreq='None'):
'''
description:
1. find counts for each glitch
2. stack over all the gliches
3. make a matix comprising importance versus channels
USAGE: MatCount, ListChannelName, ListSNR, ListConf, ListGPS, ListDuration = CreateMatCount(sigma, LowerCutOffFreq='None', UpperCutOffFreq='None') # for all glitches
MatCount, ListChannelName, ListSNR, ListConf = CreateMatCount(sigma, g_Individual, LowerCutOffFreq='None', UpperCutOffFreq='None') # for an individual glitch
:param sigma sigma: an integer to determine the upper bound of the off-source window
:param g_Individual: a grounp of a HDF5 file that has values of importance for a gltich
:param LowerCutOffFreq: a lower limit frequency cut to calculate a value of importance
:param UpperCutOffFreq: an uppper limit frequency cut to calculate a value of importance
:return:
MatCount: a matrix comprising importance versus channels
ListChannelName: a list of channel names
ListSNR: a list of SNRs
ListConf: a list of confidence
dependencies: self.HierarchyChannelAboveThreshold(g, LowerCutOffFreq, UpperCutOffFreq)
'''
if g_Individual == None: # for all the glitches in a glitch class
MatCount = np.array([])
ListSNR = np.array([])
ListConf = np.array([])
ListGPS = np.array([])
ListDuration = np.array([])
num_dataset = 0 # a number of datasets in a group, set to be 0 as a first instance
for g_label in list(self.f.keys()): # iterate through all the groups
try:
g = self.f[g_label]
except KeyError: # KeyError: 'Unable to open object (Bad object header version number)'
continue
# the following if statement is for the solution where no dataset in a group
if len(g.keys()) < num_dataset: # if a number of dataset in this group is less than that of the previous group
continue # go to the next iteration
num_dataset = len(g.keys()) # update a number of datasets with that of the current group
# get an output given by HierarchyChannelAboveThreshold()
RankingChannelAndCount, ListCount, ListChannelName, GPS, ID, SNR, confidence, duration = self.HierarchyChannelAboveThreshold(g, sigma, LowerCutOffFreq, UpperCutOffFreq)
# stack the vector of counts
MatCount = np.vstack([MatCount, ListCount]) if MatCount.size else ListCount
# append SNR
ListSNR = np.append(ListSNR, SNR)
# append confidence level
ListConf = np.append(ListConf, confidence)
# append GPS time
ListGPS = np.append(ListGPS, GPS)
# append duration
ListDuration = np.append(ListDuration, duration)
return MatCount, ListChannelName, ListSNR, ListConf, ListGPS, ListDuration
else: # individual glitch
MatCount = np.array([])
ListSNR = np.array([])
ListConf = np.array([])
# get an output given by HierarchyChannelAboveThreshold()
RankingChannelAndCount, ListCount, ListChannelName, GPS, ID, SNR, confidence, duration = self.HierarchyChannelAboveThreshold(g_Individual, sigma, LowerCutOffFreq, UpperCutOffFreq)
# stack the vector of counts
MatCount = np.vstack([MatCount, ListCount]) if MatCount.size else ListCount
# append SNR
ListSNR = np.append(ListSNR, SNR)
# append confidence level
ListConf = np.append(ListConf, confidence)
# normalize MatCount by a number of glitches
MatCount = MatCount
return MatCount, ListChannelName, ListSNR, ListConf
def CreateChannelTicks(self, ListChannelName):
    '''
    description:
        1. take the dominant sub-channel name of every channel, i.e., the part before the first '-'
        2. get the indices where the dominant name changes, followed by the index of the last channel
        3. get a tick position for every pair of consecutive indices
    dependencies: (tacitly) CreateMatCount(), make_subset_channel_based_on_samplingrate()
    USAGE: CenterTicks, ListInd, ListSubsys = CreateChannelTicks(ListChannelName)
    :param ListChannelName: a list of channel names
    :return:
        CenterTicks: the midpoint of each pair of consecutive indices, or the left index if they differ by one
        ListInd: the indices where the dominant name changes, with the index of the last channel appended
        ListSubsys: a list of dominant sensor names, one per channel
    '''
    # +++ take the dominant sub-channel names +++++++++
    ListSubsys = np.append(np.array([]), [name.split('-')[0] for name in ListChannelName])
    # ++++ get a list of index where a sub-sensor name changes +++++++++++
    # every name is compared with its predecessor; the first one is compared with a dummy name
    predecessors = ['dummy'] + list(ListSubsys[:-1])
    changes = [pos for pos, (before, now) in enumerate(zip(predecessors, ListSubsys)) if before != now]
    # append the end index
    ListInd = np.append(np.array(changes, dtype=float), ListSubsys.shape[0] - 1)
    # find the center index of a dominant sub-channel
    left, right = ListInd[:-1], ListInd[1:]
    CenterTicks = np.where(right - left != 1, (left + right) / 2., left)
    return CenterTicks, ListInd, ListSubsys
[docs] def PlotImportanceVSChannel(self, MatCount, ListChannelName, output_dir, output_file, ax=None):
'''
description:
- for all glitches in a class, this method works stand-alone
1. convert number to channel names in x ticks
2. color background based on channel types
3. plot a bar showing importance of channels
USAGE: PlotImportanceVSChannel(MatCount, ListChannelName, output_dir, output_file)
- for an individual glitch in a class, this method is used by ....
1. convert number to channel names in x ticks
2. color background based on channel types
3. plot a bar showing importance of channels
dependencies: CreateChannelTicks() tacitly CreateMatCount()
USAGE: PlotImportanceVSChannel(MatCount, ListChannelName, None, None, ax) # for on-line mode
USAGE: PlotImportanceVSChannel(MatCount, ListChannelName, output_dir, output_file) @ for off-line mode
:param MatCount: a matrix comprising importance versus channels
:param ListChannelName: a list of channel names
:param output_dir: (only used for all glitches)
:param output_file: (only used for all glitches)
:param ax: matplotlib.pyplot object (used only for an individual glitch)
:return: None
'''
if ax == None: # plot for all glitches in a class
CenterTicks, ListInd, ListSubsys= self.CreateChannelTicks(ListChannelName)
# color the background either with black or gray for each channel
Listcolor = ['black', 'gray'] * 20 * 7
for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
if edgeRight == MatCount.shape[-1] - 1: # fix the background color to match plot size
edgeRight = edgeRight + 1
ax = plt.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
ax.set_edgecolor(None)
if MatCount.shape[0] == MatCount.shape[-1]: # for an individual glitch
MatCount = MatCount.reshape(1, MatCount.shape[-1])
#++++++++ plot bar plot about importance of channels ++++++++++++++++
# sum all the counts over glitches
SumMatCount = np.sum(MatCount, axis=0) / float(MatCount.shape[0]) # normalize MatCount by a number of glitches
#if MatCount.shape[0] > 1: # if there are more than 1 sample
# STDMatCount = MatCount.std(axis=0, ddof=1) # standard deviation
#else: # if there is only one sample
# STDMatCount = np.array([None]*SumMatCount.shape[0]) # standard deviation
# plot bars about importance
plt.bar(np.arange(MatCount.shape[-1]), SumMatCount, 1, align='edge', edgecolor=None, color='blue', linewidth=0.)
#+++++++++++ plot a table of ranked channels +++++++++++++++++++
# sort channel based on the sum of the counts i.e, importance
# a list of sorted Channel names & sorted Importance
RankChannel = list(zip(ListChannelName[np.argsort(SumMatCount)[::-1]], SumMatCount[np.argsort(SumMatCount)[::-1]]))
#RankChannel = list(zip(ListChannelName[np.argsort(SumMatCount)[::-1]], SumMatCount[np.argsort(SumMatCount)[::-1]], STDMatCount[np.argsort(SumMatCount)[::-1]]))
# make pandas matrix
Impdf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Importance'))
#Impdf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Importance', 'Std'))
# add ranking label
Impdf = Impdf.assign(rank=np.arange(1, MatCount.shape[-1] + 1, 1))
# round importance
Impdf['Importance'] = Impdf['Importance'].astype(float).round(3) # = df.value1.round()
#Impdf['Std'] = Impdf['Std'].astype(float).round(4)
# re-sort columns to make rank labels coming first
cols = Impdf.columns.tolist()
cols = cols[-1:] + cols[:-1]
Impdf = Impdf[cols]
# take the first 5 high ranking channels
dcsummary = Impdf.iloc[:5]
# plot table on the top of the bar plot
table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=[0.1, 0.6, 0.2])
#table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top',colWidths=[0.1, 0.6, 0.1, 0.1])
table.set_fontsize(8)
#++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
#++++++++++++++ alter ticks +++++++++++++++++
# plot xticks using the dominant sub sensor names
plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
# put ticks above
plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
plt.tick_params(axis="y", labelsize=10)
plt.grid(False)
plt.xlim(0, ListSubsys.shape[0])
# set axis label
plt.ylabel('Importance', fontsize=15)
plt.xlabel('Auxiliary channel', fontsize=15)
plt.tight_layout(rect=[0, 0.03, 1, 0.8])
# save a plot
output_path = os.path.join(output_dir, output_file)
plt.savefig(output_path)
plt.close()
print('Completed Saving a plot of Importance VS channel')
return None
else: # for an individual glitch
# ++++++++++++++ alter ticks +++++++++++++++++
# get xticks for dominant channels
CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName)
# plot xticks using the dominant sub sensor names
ax.set_xticks(CenterTicks)
ax.set_xticklabels(ListSubsys[ListInd.astype(int)])
# put ticks above
ax.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-35, length=0, labelrotation=90,size=0, labelsize=15)
ax.tick_params(axis='y', labelsize=15)
ax.set_xlim(0, ListSubsys.shape[0])
# color the background either with blakc or gray for each channel
# ++++++++ plot bar plot about importance of channels ++++++++++++++++
Listcolor = ['black', 'gray'] * 20 * 7
for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
if edgeRight == MatCount.shape[-1] - 1: # fix the background color to match plot size
edgeRight = edgeRight + 1
ax.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
if MatCount.shape[0] == MatCount.shape[-1]: # for an individual glitch
MatCount = MatCount.reshape(1, MatCount.shape[-1])
# ++++++++ plot bar plot about importance of channels ++++++++++++++++
# sum all the counts over glitches
SumMatCount = np.sum(MatCount, axis=0)
# +++++++++++ plot a table of high ranking channels +++++++++++++++++++
# sort channel based on the sum of the counts i.e, importance
# a list of sorted Channel names & sorted Importance
RankChannel = list(
zip(ListChannelName[np.argsort(SumMatCount)[::-1]], SumMatCount[np.argsort(SumMatCount)[::-1]]))
# make pandas matrix
Impdf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Importance'))
# add ranking label
Impdf = Impdf.assign(rank=np.arange(1, MatCount.shape[-1] + 1, 1))
# round importance
Impdf['Importance'] = Impdf['Importance'].astype(float).round(3) # = df.value1.round()
# re-sort columns to make ranking labels coming first
cols = Impdf.columns.tolist()
cols = cols[-1:] + cols[:-1]
Impdf = Impdf[cols]
# take the first 5 high ranking channels
dcsummary = Impdf.iloc[:5]
# plot bars about importance
ax.bar(np.arange(MatCount.shape[-1]), SumMatCount, 1, align='edge', edgecolor=None, color='blue', linewidth=0.)
# plot table on the top of the bar plot
table = ax.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=[0.1, 0.6, 0.2])
table.set_fontsize(15)
table.scale(0.5, 2)
# ++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
# set axis label
ax.set_ylabel('Importance', fontsize=20)
ax.set_xlabel('Auxiliary channel', fontsize=20)
ax.grid(False)
return None
def PlotOccurrenceVSChannel(self, MatCount, ListChannelName, ListGPS, ListDuration, output_dir, output_file):
    '''
    description:
        1. plot glitch indices versus channels as an image of importance, with a strip on top coloring the
           dominant sub sensors and a color bar, and save it to output_dir/output_file
        2. save a table of GPS times, durations, and importance of all the channels as
           GPSDurationImportanceChannels.csv in output_dir
        nothing is plotted nor saved if MatCount is a vector, i.e., there is only one glitch
    USAGE: PlotOccurrenceVSChannel(MatCount, ListChannelName, ListGPS, ListDuration, output_dir, output_file)
    :param MatCount: a matrix comprising importance versus channels
    :param ListChannelName: a list of channel names
    :param ListGPS: a list of GPS times (numpy array)
    :param ListDuration: a list of durations (numpy array)
    :param output_dir: a path to an output directory
    :param output_file: a name of an output file of the plot
    :return: None
    dependencies: CreateChannelTicks()
    '''
    # a vector means a single glitch; the dimension is tested instead of comparing shape[0] with shape[-1]
    # because a matrix with as many glitches as channels is a valid input
    if MatCount.ndim == 1:
        print('There is only one single glitch. Skip the plot of glitch index VS channels')
        return None
    NumberOfChannels = MatCount.shape[-1]
    # get xticks for dominant channels
    CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName)
    # ListInd has one more entry (the closing edge) than CenterTicks; keep one label per tick
    TickLabels = ListSubsys[ListInd[:-1].astype(int)]
    # the x range shared by the strip and the image
    xmin, xmax = -0.5, round(NumberOfChannels * 1.001)-1+0.5
    fig = plt.figure()
    # make grid for subplots
    gs = GridSpec(9, 30)
    # a subplot showing channel position
    ax1 = fig.add_subplot(gs[:1, :-1]) # First row, up to the second last column
    # color the background either with black or gray for each dominant sub sensor
    for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:])):
        if edgeRight == NumberOfChannels - 1: # adjust the right edge of the last color span
            edgeRight = edgeRight + 1
        # shift by a half as the image below is centered at integer values
        ax1.axvspan(edgeLeft-0.5, edgeRight-0.5, facecolor=('black', 'gray')[color_ind % 2], alpha=0.5, edgecolor=None)
    # limit the xaxis of the first subplot
    ax1.axis(xmin=xmin, xmax=xmax)
    # get rid of ticks of the first plot
    ax1.yaxis.set_major_locator(plt.NullLocator())
    ax1.xaxis.set_major_locator(plt.NullLocator())
    # assign a size for the second subplot
    ax2 = fig.add_subplot(gs[1:, :-1])
    cmap = getattr(plt.cm, 'OrRd') # the color map shared by the image and the color bar
    ax2.imshow(MatCount, aspect='auto', interpolation='nearest', cmap=cmap, origin='upper')
    ax2.set_xticks(CenterTicks)
    ax2.axis(xmin=xmin, xmax=xmax)
    ax2.set_xticklabels(TickLabels)
    ax2.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=5, length=0, labelrotation=90, size=0, labelsize=10)
    ax2.tick_params(axis="y", labelsize=10)
    ax2.grid(False)
    ax2.set_ylabel('Glitch index', fontsize=15)
    ax2.set_xlabel('Auxiliary channel', fontsize=15)
    ###### Make a color bar
    norm = matplotlib.colors.Normalize(vmin=MatCount.min(), vmax=MatCount.max())
    ax3 = fig.add_subplot(gs[1:, -1])
    cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
    ax3.grid(False)
    cbar.set_label(r'$q$', size=15)
    fig.subplots_adjust(hspace=0)
    output_path = os.path.join(output_dir, output_file)
    fig.savefig(output_path)
    plt.close(fig)
    print('Completed Saving a plot of Glitch occurrence VS channel')
    #-----------------------------------------------------------------+
    # make .csv file comprising GPS, importance of all the channels   |
    #-----------------------------------------------------------------+
    # concatenate a list of GPS times, durations, and importance of all the channels
    MatGPSDurationImpotanceChannels = np.concatenate((ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1), MatCount), axis=1)
    # make a list of column names
    columnName = np.append(np.array(['GPS', 'duration']), ListChannelName)
    # convert it to a pandas frame
    dfGPSDurationImportanceChannels = pd.DataFrame(MatGPSDurationImpotanceChannels, columns=columnName)
    path_output_fileGPSImportanceChannels = os.path.join(output_dir, 'GPSDurationImportanceChannels.csv')
    # save it as .csv file
    dfGPSDurationImportanceChannels.to_csv(path_output_fileGPSImportanceChannels)
    print('Completed saving a table comprising GPS times, durations, and importance of the channels')
    return None
def MakeMatrixOccurrenceVSChannel(self, MatCount, ListChannelName, ListGPS, ListDuration, output_dir):
    '''
    description:
        save a matrix of GPS, duration, importance of all the channels as GPSDurationImportanceChannels.csv in output_dir
        each row is a glitch; the columns are 'GPS', 'duration', followed by the channel names
    USAGE: MakeMatrixOccurrenceVSChannel(MatCount, ListChannelName, ListGPS, ListDuration, output_dir)
    :param MatCount: a matrix comprising importance versus channels, or a vector for a single glitch
    :param ListChannelName: a list of channel names
    :param ListGPS: a list of GPS times (numpy array)
    :param ListDuration: a list of durations (numpy array)
    :param output_dir: a path to an output directory
    :return: None
    '''
    # a vector means a single glitch: make a row vector
    # the dimension is tested instead of comparing shape[0] with shape[-1] because a matrix with
    # as many glitches as channels must be kept as it is
    if MatCount.ndim == 1:
        MatCount = MatCount.reshape(1, MatCount.shape[0])
    #-----------------------------------------------------------------+
    # make .csv file comprising GPS, importance of all the channels   |
    #-----------------------------------------------------------------+
    # concatenate a list of GPS times, durations, and importance of all the channels
    MatGPSDurationImpotanceChannels = np.concatenate((ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1), MatCount), axis=1)
    # make a list of column names
    columnName = np.append(np.array(['GPS', 'duration']), ListChannelName)
    # convert it to a pandas frame
    dfGPSDurationImportanceChannels = pd.DataFrame(MatGPSDurationImpotanceChannels, columns=columnName)
    path_output_fileGPSImportanceChannels = os.path.join(output_dir, 'GPSDurationImportanceChannels.csv')
    # save it as .csv file
    dfGPSDurationImportanceChannels.to_csv(path_output_fileGPSImportanceChannels)
    print('Completed saving a table comprising GPS times, durations, and importance of the channels')
    return None
def PlotTimeVSChannel(self, MatCount, ListChannelName, ListGPS, output_dir, output_file, startT, endT, dt):
    '''
    description:
        make a plot of glitch times versus channels where the time flow is from top to bottom
        If there are more than one glitch in a time bin, those glitches' values of importance are averaged
        If there is no glitches in a time bin, all the values of importance are set to be zero
        nothing is plotted if MatCount is a vector, i.e., there is only one glitch
    dependencies: CreateChannelTicks()
    USAGE: PlotTimeVSChannel(MatCount, ListChannelName, ListGPS, output_dir, output_file, startT, endT, dt)
    :param MatCount: a matrix comprising importance versus channels
    :param ListChannelName: a list of channel names
    :param ListGPS: a list of GPS times, one per row of MatCount
    :param output_dir: a path to an output directory
    :param output_file: a name of an output file
    :param startT: start time of an epoch
    :param endT: end time of an epoch
    :param dt: step size in sec of the time slice
    :return: None
    '''
    # a vector means a single glitch; the dimension is tested instead of comparing shape[0] with shape[-1]
    # because a matrix with as many glitches as channels is a valid input
    if MatCount.ndim == 1:
        print('There is only one single glitch. Skip the plot of glitch index VS channels')
        return None
    NumberOfChannels = MatCount.shape[-1]
    TimeSlice = np.arange(0, endT - startT, dt) # time steps from the beginning of the epoch with the step size of dt
    # a matrix with all the elements equal to zero: # of the time slices in row, # of channels in column
    TimeSlice_MatCount = np.zeros((TimeSlice.shape[0], NumberOfChannels))
    # convert the GPS times of the sample to the time since the beginning of the epoch
    Time_sample = ListGPS - startT
    # fill the values of the importance of the sample into the time slices
    for ind, SliceStart in enumerate(TimeSlice):
        # find the index of the samples that are within this time slice
        index_within = np.where((Time_sample >= SliceStart) & (Time_sample < SliceStart + dt))[0]
        if index_within.size > 0:
            # average, channel by channel, exactly the samples found in this time slice
            # (selecting the rows by index does not rely on the samples being sorted in time)
            TimeSlice_MatCount[ind, :] = MatCount[index_within, :].mean(axis=0)
    # get xticks for dominant channels
    CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName)
    # ListInd has one more entry (the closing edge) than CenterTicks; keep one label per tick
    TickLabels = ListSubsys[ListInd[:-1].astype(int)]
    # the x range shared by the strip and the image, same as in the other plots versus channels
    xmin, xmax = -0.5, round(NumberOfChannels * 1.001)-1+0.5
    fig = plt.figure()
    # make grid for subplots
    gs = GridSpec(9, 30)
    # a subplot showing channel position
    ax1 = fig.add_subplot(gs[:1, :-1]) # First row, up to the second last column
    # color the background either with black or gray for each dominant sub sensor
    for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:])):
        # the last edge given by CreateChannelTicks() is the index of the last channel
        if edgeRight == NumberOfChannels - 1: # adjust the right edge of the last color span
            edgeRight = edgeRight + 1
        # shift by a half as the image below is centered at integer values
        ax1.axvspan(edgeLeft-0.5, edgeRight-0.5, facecolor=('black', 'gray')[color_ind % 2], alpha=0.5, edgecolor=None)
    # limit the xaxis of the first subplot
    ax1.axis(xmin=xmin, xmax=xmax)
    # get rid of ticks of the first plot
    ax1.yaxis.set_major_locator(plt.NullLocator())
    ax1.xaxis.set_major_locator(plt.NullLocator())
    # assign a size for the second subplot
    ax2 = fig.add_subplot(gs[1:, :-1])
    ax2.imshow(TimeSlice_MatCount, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, origin='upper')
    ax2.set_xticks(CenterTicks)
    ax2.set_xticklabels(TickLabels)
    ax2.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=5, length=0, labelrotation=90, size=0, labelsize=10)
    ax2.tick_params(axis="y", labelsize=10)
    ax2.grid(False)
    ax2.axis(xmin=xmin, xmax=xmax)
    start_epoch_UTC = tconvert(startT) # make gwpy object of UTC time
    # set y label; a step size without a dedicated name is written in seconds
    unit_time = {1: 'secs', 60: 'mins', 3600: 'hours'}.get(dt, '{} secs'.format(dt))
    ax2.set_ylabel('Time [{0}] since {1}-{2:02d}-{3:02d} {4:02d}:{5:02d}:{6:02d} UTC'.format(unit_time, start_epoch_UTC.year, start_epoch_UTC.month, start_epoch_UTC.day, start_epoch_UTC.hour, start_epoch_UTC.minute, start_epoch_UTC.second), fontsize=15)
    ax2.set_xlabel('Auxiliary channel', fontsize=15)
    ###### Make a color bar
    cmap = matplotlib.cm.viridis
    norm = matplotlib.colors.Normalize(vmin=TimeSlice_MatCount.min(), vmax=TimeSlice_MatCount.max())
    ax3 = fig.add_subplot(gs[1:, -1])
    cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
    ax3.grid(False)
    cbar.set_label('Importance', size=10)
    fig.subplots_adjust(hspace=0)
    output_path = os.path.join(output_dir, output_file)
    fig.savefig(output_path)
    plt.close(fig)
    print('Completed Saving a plot of time VS channel')
    return None
def PlotSNRVSChannel(self, MatCount, ListChannelName, ListSNR, output_dir, output_file):
    '''
    description:
        make a plot of SNR of h(t) versus channels: the rows of MatCount are sorted by descending SNR and shown
        as an image of importance, with a strip on top coloring the dominant sub sensors and a color bar
        nothing is plotted if MatCount is a vector, i.e., there is only one glitch
    dependencies: CreateChannelTicks()
    USAGE: PlotSNRVSChannel(MatCount, ListChannelName, ListSNR, output_dir, output_file)
    :param MatCount: a matrix comprising importance versus channels
    :param ListChannelName: a list of channel names
    :param ListSNR: a list of SNRs, one per row of MatCount
    :param output_dir: a path to an output directory
    :param output_file: a name of an output file
    :return: None
    '''
    # a vector means a single glitch; the dimension is tested instead of comparing shape[0] with shape[-1]
    # because a matrix with as many glitches as channels is a valid input
    if MatCount.ndim == 1:
        print('There is only one single glitch. Skip the plot of SNR VS channels')
        return None
    NumberOfChannels = MatCount.shape[-1]
    # get xticks for dominant channels
    CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName)
    # ListInd has one more entry (the closing edge) than CenterTicks; keep one label per tick
    TickLabels = ListSubsys[ListInd[:-1].astype(int)]
    # the x range shared by the strip and the image
    xmin, xmax = -0.5, round(NumberOfChannels * 1.001)-1+0.5
    fig = plt.figure()
    # make grid for subplots
    gs = GridSpec(9, 30)
    # a subplot showing channel position
    ax1 = fig.add_subplot(gs[:1, :-1])
    # color the background either with black or gray for each dominant sub sensor
    for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:])):
        if edgeRight == NumberOfChannels - 1: # adjust the right edge of the last color span
            edgeRight = edgeRight + 1
        # shift by a half as the image below is centered at integer values
        ax1.axvspan(edgeLeft-0.5, edgeRight-0.5, facecolor=('black', 'gray')[color_ind % 2], alpha=0.5, edgecolor=None)
    # limit the xaxis of the first subplot
    ax1.axis(xmin=xmin, xmax=xmax)
    # get rid of ticks of the first plot
    ax1.yaxis.set_major_locator(plt.NullLocator())
    ax1.xaxis.set_major_locator(plt.NullLocator())
    # assign a size for the second subplot
    ax2 = fig.add_subplot(gs[1:, :-1])
    # sort the rows and the SNRs in the descending order of SNR
    order = np.argsort(ListSNR)[::-1]
    MatCountSortedSNR = MatCount[order, :]
    y_ticks = np.asarray(ListSNR)[order]
    ax2.imshow(MatCountSortedSNR, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, origin='upper')
    ax2.set_xticks(CenterTicks)
    ax2.set_xticklabels(TickLabels)
    ax2.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=5, length=0, labelrotation=90, size=0, labelsize=10)
    ax2.tick_params(axis="y", labelsize=10)
    ax2.axis(xmin=xmin, xmax=xmax)
    ax2.grid(False)
    fig.subplots_adjust(hspace=0)
    # take dominant ticks only, i.e., the rows where the rounded SNR changes
    SNR_range = y_ticks.max() - y_ticks.min() # the range of SNR
    if SNR_range > 1000:
        RoundPoint = -2 # round to hundreds
    elif SNR_range > 100:
        RoundPoint = -1 # round to tens
    else:
        RoundPoint = 0 # round to units
    ListYTicksOld = [] # row indices where a tick is put
    ListYticksNew = [] # rounded SNRs shown at those rows
    for ind, RoundedSNR in enumerate(np.round(y_ticks, RoundPoint)): # iterate over rounded numbers of SNR
        # the first row always gets a tick, then a row gets one if its value differs from the previous tick
        if not ListYticksNew or ListYticksNew[-1] != RoundedSNR:
            ListYticksNew.append(RoundedSNR)
            ListYTicksOld.append(ind)
    ax2.set_yticks(ListYTicksOld)
    ax2.set_yticklabels(ListYticksNew)
    ax2.set_ylabel('SNR of h(t)', fontsize=15)
    ax2.set_xlabel('Auxiliary channel', fontsize=15)
    ###### Make a color bar
    cmap = matplotlib.cm.viridis
    norm = matplotlib.colors.Normalize(vmin=MatCount.min(), vmax=MatCount.max())
    ax3 = fig.add_subplot(gs[1:, -1])
    cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
    ax3.grid(False)
    cbar.set_label('Importance', size=10)
    output_path = os.path.join(output_dir, output_file)
    fig.savefig(output_path)
    plt.close(fig)
    print('Completed Saving a plot of SNR VS channel')
    return None
def PlotConfidenceVSChannel(self, MatCount, ListChannelName, ListConf, output_dir, output_file):
    '''
    description:
        make a plot of values of confidence level of Gravity Spy versus channels.
        The rows of MatCount are re-ordered by descending confidence level and drawn as a
        heat map. A black/gray strip above the heat map marks the channel groups and a
        color bar of importance is drawn on the right.
    dependencies: CreateChannelTicks()
    USAGE: PlotConfidenceVSChannel(MatCount, ListChannelName, ListConf, output_dir, output_file)

    :param MatCount: a matrix comprising importance versus channels (rows are re-ordered with ListConf, columns are channels)
    :param ListChannelName: a list of channel names
    :param ListConf: a list of confidence levels, one per row of MatCount
    :param output_dir: a directory where the plot is saved
    :param output_file: a file name of the plot
    :return: None
    '''
    MatCount = np.asarray(MatCount)
    # a 1-D array holds a single glitch, for which this plot is not useful.
    # The dimensionality is tested (instead of shape[0] == shape[-1]) so that a square
    # matrix, i.e., as many glitches as channels, is not skipped by mistake
    if MatCount.ndim < 2:
        print('There is only one single glitch. Skip the plot of confidence level VS channels')
        return None
    # the background of each channel group alternates between black and gray
    band_colors = ('black', 'gray')
    # get xticks for dominant channels
    # NOTE(review): ListInd and ListSubsys are assumed to be numpy arrays (they are used with .astype and fancy indexing below)
    CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName)
    num_channels = MatCount.shape[-1]
    x_max = round(num_channels * 1.001) - 1 + 0.5
    fig = plt.figure()
    # make grid for subplots: 9 rows, 30 columns
    gs = GridSpec(9, 30)
    # a subplot showing channel position
    ax1 = fig.add_subplot(gs[:1, :-1])
    for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:])):
        if edgeRight == num_channels - 1:  # adjust the right edge of the last color span
            edgeRight = edgeRight + 1
        # cycle through the two colors so that any number of groups is supported
        ax1.axvspan(edgeLeft - 0.5, edgeRight - 0.5, facecolor=band_colors[color_ind % len(band_colors)], alpha=0.5, edgecolor=None)
    # limit the xaxis of the first subplot
    ax1.axis(xmin=-0.5, xmax=x_max)
    # get rid of ticks of the first plot
    ax1.yaxis.set_major_locator(plt.NullLocator())
    ax1.xaxis.set_major_locator(plt.NullLocator())
    # the heat map takes the remaining rows of the grid
    ax2 = fig.add_subplot(gs[1:, :-1])
    # sort rows based on confidence levels (descending)
    MatCountSortedConf = MatCount[np.argsort(ListConf)[::-1], :]
    ax2.imshow(MatCountSortedConf, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, origin='upper')
    ax2.set_xticks(CenterTicks)
    ax2.set_xticklabels(ListSubsys[ListInd.astype(int)])
    ax2.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=5, length=0, labelrotation=90, size=0, labelsize=10)
    ax2.tick_params(axis="y", labelsize=10)
    ax2.grid(False)
    ax2.axis(xmin=-0.5, xmax=x_max)
    fig.subplots_adjust(hspace=0)
    y_ticks = np.sort(ListConf)[::-1]  # descending values of confidence level
    # take dominant ticks: a row gets a tick only when its confidence, rounded to the first decimal,
    # differs from the previous row. None is the "no previous value" marker so that a leading
    # rounded value of 0.0 still gets a tick
    ListYticksNew = []
    ListYTicksOld = []
    previous = None
    for ind, rounded in enumerate(np.round(y_ticks, 1)):
        if previous is None or rounded != previous:
            ListYticksNew.append(rounded)
            ListYTicksOld.append(ind)
            previous = rounded
    ax2.set_yticks(ListYTicksOld)
    ax2.set_yticklabels(ListYticksNew)
    ax2.set_ylabel('Confidence level', fontsize=15)
    ax2.set_xlabel('Auxiliary channel', fontsize=15)
    ###### Make a color bar
    cmap = matplotlib.cm.viridis
    norm = matplotlib.colors.Normalize(vmin=MatCount.min(), vmax=MatCount.max())
    ax3 = fig.add_subplot(gs[1:, -1])
    cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
    ax3.grid(False)
    cbar.set_label('Importance', size=10)
    ########
    output_path = os.path.join(output_dir, output_file)
    fig.savefig(output_path)
    # close this very figure rather than whatever figure happens to be current
    plt.close(fig)
    print('Completed Saving a plot of Confidence VS channel')
    return None
def TableImportance(self, MatCount, ListChannelName, output_dir, output_file):
    '''
    description:
        make a table of values of importance as a .csv file.
        The importance of each channel is averaged over the rows of MatCount and the
        channels are written in descending order of that average.
    USAGE: TableImportance(MatCount, ListChannelName, output_dir, output_file)

    :param MatCount: a matrix comprising importance versus channels (channels in columns); a 1-D array is treated as a single row
    :param ListChannelName: a list of channel names, one per column of MatCount
    :param output_dir: a directory where the table is saved
    :param output_file: a file name of the table
    :return: None
        the saved table has columns 'rank', 'label' (original column index of the channel),
        'Channel' and 'Importance' (rounded to 3 decimals), plus the pandas index column
    '''
    MatCount = np.asarray(MatCount)
    # a 1-D array is a single row. The dimensionality is tested (instead of
    # shape[0] == shape[-1]) because a square matrix must not be reshaped
    if MatCount.ndim == 1:
        MatCount = MatCount.reshape(1, -1)
    ListChannelName = np.asarray(ListChannelName)
    # average the counts over rows
    mean_importance = np.sum(MatCount, axis=0) / float(MatCount.shape[0])
    # channel indices from the most to the least important (computed once)
    order = np.argsort(mean_importance)[::-1]
    Impdf = pd.DataFrame({
        'rank': np.arange(1, MatCount.shape[-1] + 1, 1),
        'label': order,
        'Channel': ListChannelName[order],
        'Importance': mean_importance[order].astype(float).round(3),
    })
    output_path = os.path.join(output_dir, output_file)
    # save it (the index column is kept on purpose, as readers drop 'Unnamed: 0')
    Impdf.to_csv(output_path)
    print('Completed saving a table of importance of channels')
    return None
def BinomialDist(self, k, n, likelihood):
    '''
    description: Binomial distribution
    USAGE: out = BinomialDist(k, n, likelihood)

    :param k: the number of detections
    :param n: the number of trials
    :param likelihood: likelihood of detection in a single trial
    :return: out: a value of probability mass to find k detections out of n trials
    '''
    # scipy.stats is used because the explicit formula
    #   n! / k! / (n - k)! * likelihood ** k * (1 - likelihood) ** (n - k)
    # fails for a large value of n or k
    return binom.pmf(k, n, likelihood)
def BinomialTest(self, k, N, rate):
    '''
    description: compute p-value of one-tailed Binomial test against null rate
    USAGE: p_value = BinomialTest(k, N, rate)

    :param k: observed number of successes
    :param N: total number of samples
    :param rate: rate of successes drawn from a null hypothesis
    :return: p_value: probability of successes equal or greater than the observed number of successes, P(X >= k) for X ~ Binomial(N, rate)
    '''
    # The exact upper tail is used. Halving a two-sided p-value is only correct for a
    # symmetric null (rate = 0.5) with k above the expectation; for skewed nulls it
    # underestimates the p-value, and below the expectation it yields P(X > k).
    # sf(k - 1) = P(X > k - 1) = P(X >= k) as k is an integer count
    p_value = binom.sf(k - 1, N, rate)
    return p_value
def query_targetglitch_null(self, path_target_glitch_dataset, path_null_dataset):
    '''
    description:
        load files, otherwise, end the program
    USAGE: df1, df0 = query_targetglitch_null(path_target_glitch_dataset, path_null_dataset)

    :param path_target_glitch_dataset: a path to .csv file of a target glitch class
    :param path_null_dataset: a path to .csv file of a null dataset
    :return:
        df1: a pandas dataframe of a target glitch class
        df0: a pandas dataframe of a null dataset
    :raises SystemExit: with exit status 1 when either file can not be read
    '''
    loaded = []
    # the target file is read first, then the null file
    for path in (path_target_glitch_dataset, path_null_dataset):
        try:
            loaded.append(pd.read_csv(path))
        except IOError:
            print('No such a file {}'.format(path))
            # a non-zero status tells the calling shell that the program failed
            sys.exit(1)
    df1, df0 = loaded
    return df1, df0
def Student_t_independet_test(self, data1, data2, Welch_test=True):
    '''
    description: independent two-sample t-test
    USAGE: t_value, p_value = self.Student_t_independet_test(data1, data2)

    :param data1: a population
    :param data2: another population
    :param Welch_test: if True, Welch's t-test (unequal variances) is performed, otherwise Student's t-test (equal variances)
    :return:
        t_value: t-value (critical value)
        p_value: p-value
    '''
    # when both sample standard deviations vanish the t-value would be infinite,
    # so the outcome is fixed by hand to "no difference"
    combined_spread = data1.std(ddof=1) + data2.std(ddof=1)
    if combined_spread == 0:
        return 0, 1
    # only a value equal to True selects Welch's test; anything else assumes equal variances
    pooled_variance = not (Welch_test == True)
    t_value, p_value = ttest_ind(data1, data2, equal_var=pooled_variance)
    return t_value, p_value
def chisquare_test(self, target_mat, null_mat):
    '''
    description:
        calculate the values of chi-square of each channel in each glitch
        and put out values of chi-square and corresponding p-values
    USAGE: chi2_value, p_value = chisquare_test(target_mat, null_mat)

    :param target_mat: a matrix of target samples where glitch indices are in rows and channels are in columns
    :param null_mat: a matrix of null samples where glitch indices are in rows and channels are in columns
    :return:
        chi2_value: a matrix of chi-square values where glitch indices are in rows and channels are in columns
        p_value: a matrix of p-values where glitch indices are in rows and channels are in columns
    '''
    std_ddof = 1  # sample (unbiased) standard deviation of the null
    chi2_dof = 1  # degrees of freedom is 1 for a single point chi-square test
    mean_null = null_mat.mean(axis=0)  # the mean of null samples channel by channel
    std_null = null_mat.std(ddof=std_ddof, axis=0)  # the standard deviation of the null samples channel by channel
    # a channel with a constant null has std_null == 0; the resulting 0/0 and x/0
    # are expected and corrected right below, so the numpy warnings are silenced
    with np.errstate(divide='ignore', invalid='ignore'):
        chsqr_values = ((target_mat - mean_null) / std_null) ** 2
    # when std_null is zero:
    #   chi-square is nan when the numerator is zero (it is supposed to be zero)
    #   chi-square is infinite when the numerator is non-zero
    chsqr_values[np.isnan(chsqr_values)] = 0  # replace with zero
    chsqr_values[np.isposinf(chsqr_values)] = 0  # replace with zero (forcefully set even if not correct)
    # p-value of the right-sided chi-square test. The survival function is used
    # because 1 - cdf cancels to exactly 0 for large chi-square values
    p_values = chi2.sf(chsqr_values, chi2_dof)
    return chsqr_values, p_values
def find_channels(self, df_target):
    '''
    description:
        find a list of channels from the target samples.
        Every column except the bookkeeping columns 'Unnamed: 0', 'GPS' and
        'duration' is taken as a channel, so both the new format of the table
        (with 'duration') and the old format (without it) are supported.
    USAGE: channels = find_channels(df_target)

    :param df_target: target samples in pandas frame
    :return: channels: a numpy array of channel names, in the column order of df_target
    '''
    # filtering (instead of drop + except ValueError) works regardless of which
    # exception the installed pandas raises for a missing label
    non_channel_columns = ('Unnamed: 0', 'GPS', 'duration')
    channels = np.array([column for column in df_target.columns.tolist() if column not in non_channel_columns])
    return channels
def Calculate_number_and_rate(self, df_target, df_null, d_c=None, err_cal=False, channels=0):
    '''
    description:
        1. use the target and null samples
        2. calculate the numbers of the target and null samples above threshold
        3. calculate the fraction of the target and null samples above threshold
        4. calculate the statistic rate1 / (rate1 + rate0) of each channel
    USAGE: channels, list_num1, list_num0, total_sample1, total_sample0, list_Causal, list_causal_err = Calculate_number_and_rate(df_target, df_null, d_c=None, err_cal=False, channels=0)

    :param df_target: target samples in pandas frame
    :param df_null: null samples in pandas frame
    :param d_c: a threshold. if it is None, the threshold of each channel is the mean value of the null samples of that channel
    :param err_cal: boolean, whether to calculate the error of the statistics or not
    :param channels: a list of channels. if it is an integer (0 in default) or None, all the channels in the target samples are used
    :return:
        channels: a list of channels
        list_num1: a list of the number of the target samples above a threshold
        list_num0: a list of the number of the null samples above a threshold
        total_sample1: the number of the target samples, float
        total_sample0: the number of the null samples, float
        list_Causal: a list of the statistics
        list_causal_err: a list of the errors of the statistics. It is None if err_cal = False
    '''
    if channels is None or isinstance(channels, int):
        # channels are not provided, so all the channels in the target samples are used
        channels = self.find_channels(df_target)
    total_sample1 = float(df_target.shape[0])  # total number of target glitches
    total_sample0 = float(df_null.shape[0])  # total number of null samples
    list_num1 = []  # numbers of target samples above the threshold
    list_num0 = []  # numbers of null samples above the threshold
    list_Causal = []  # the statistics
    list_causal_err = []  # the errors of the statistics
    for ch in channels:  # iterate through channels
        column = '{}'.format(ch)
        # the threshold is kept in a local variable so that, when d_c is None,
        # every channel uses the mean of its own null samples
        threshold = df_null[column].mean() if d_c is None else d_c
        # the number of samples that have values greater than the threshold
        num_d1 = int((df_target[column] > threshold).sum())
        num_d0 = int((df_null[column] > threshold).sum())
        # the fraction of samples that have values greater than the threshold
        rate1 = num_d1 / total_sample1
        rate0 = num_d0 / total_sample0
        rate_sum = rate1 + rate0
        # if the denominator is zero, the statistic is manually set to zero
        # (it is ok, this channel should not pass Binomial test)
        Causal = rate1 / rate_sum if rate_sum != 0 else 0
        if err_cal:
            # binomial standard errors of the two fractions
            err1 = np.sqrt(rate1 * (1 - rate1) / total_sample1)
            err0 = np.sqrt(rate0 * (1 - rate0) / total_sample0)
            if rate_sum != 0:
                # error propagation of f = rate1 / (rate1 + rate0):
                #   df/drate1 = rate0 / (rate1 + rate0)^2, df/drate0 = -rate1 / (rate1 + rate0)^2
                causal_err = np.sqrt(rate0 ** 2 * err1 ** 2 + rate1 ** 2 * err0 ** 2) / rate_sum ** 2
            else:
                causal_err = 0
            list_causal_err.append(causal_err)
        # fill the values
        list_num1.append(num_d1)
        list_num0.append(num_d0)
        list_Causal.append(Causal)
    # float arrays, as the callers divide them by the totals
    list_num1 = np.array(list_num1, dtype=float)
    list_num0 = np.array(list_num0, dtype=float)
    list_Causal = np.array(list_Causal, dtype=float)
    list_causal_err = np.array(list_causal_err, dtype=float) if err_cal else None
    return channels, list_num1, list_num0, total_sample1, total_sample0, list_Causal, list_causal_err
def find_meaing_ful_confidence(self, df1, df0, BinomialTestConfidence, d_c=None, err_cal=False, channels=0):
    '''
    description
        1. use the target glitches (df1) and the null dataset (df0)
        2. compute the statistic and the numbers of samples above threshold with Calculate_number_and_rate()
        3. perform one-tailed Binomial test with BinomialTest()
    dependencies: Calculate_number_and_rate(), BinomialTest()
    USAGE: list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, list_channels = find_meaing_ful_confidence(df1, df0, BinomialTestConfidence, d_c)

    :param df1: target glitches in the pandas format
    :param df0: null dataset in the pandas format
    :param BinomialTestConfidence: a confidence level used for one-tailed Binomial test
    :param d_c: user defined threshold to claim detection of a glitch, None in default
        if d_c is None, values of threshold is given by the mean value of importance generated by the null dataset
    :param err_cal: boolean, whether to calculate the error of the statistic or not
    :param channels: a list of channels, 0 in default
    :return:
        list_Causal_passed: a list of the statistic of channels that passed the one-tailed Binomial test, otherwise, zero
        list_Causal_fail: a list of the statistic of channels that failed the one-tailed Binomial test, otherwise, zero
        list_causal_passed_err: a list of the error of the statistic of channels that passed the test, otherwise, zero. None if the error is not calculated
        list_causal_failed_err: a list of the error of the statistic of channels that failed the test, otherwise, zero. None if the error is not calculated
        list_Test: a list of results of the Binomial test, 'pass' or 'fail'
        channels: a list of channel names
    '''
    channels, list_num1, list_num0, total_sample1, total_sample0, list_Causal, list_causal_err = self.Calculate_number_and_rate(df1, df0, d_c=d_c, err_cal=err_cal, channels=channels)
    list_rate0 = list_num0 / total_sample0  # the fractions of the null samples above the threshold
    # p-values of one-tailed Binomial test against the null rate, one per channel
    p_values = np.array([self.BinomialTest(num1, total_sample1, rate0) for num1, rate0 in zip(list_num1, list_rate0)], dtype=float)
    # a channel passes when its p-value is below 1 - {BinomialTestConfidence}
    passed = p_values < (1 - BinomialTestConfidence)
    # a passed channel keeps its statistic in the "passed" list and has zero in the
    # "fail" list, and vice versa. This split is needed for the plotting
    list_Causal_passed = np.where(passed, list_Causal, 0.0)
    list_Causal_fail = list_Causal - list_Causal_passed
    list_Test = np.where(passed, 'pass', 'fail')
    if err_cal and list_causal_err is not None:  # the error of the statistic is calculated
        list_causal_passed_err = np.where(passed, list_causal_err, 0.0)
        list_causal_failed_err = list_causal_err - list_causal_passed_err
    else:  # otherwise, they are None
        list_causal_passed_err = None
        list_causal_failed_err = None
    return list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, channels
def PlotCausalityVSChannel(self, list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, ListChannelName, output_dir, output_file, BinomialTestConfidence, freq_bands=Const.freq_bands):
    '''
    description:
        make a bar plot of the witness ratio statistic (WRS) versus channels.
        Channels that passed the Binomial test are drawn in blue, the failed ones in
        light blue, and a table of the 5 highest ranking channels is put on top.
    dependencies: CreateChannelTicks(), ranking_channels()
    USAGE: PlotCausalityVSChannel(list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_Test, ListChannelName, output_dir, output_file, BinomialTestConfidence)

    :param list_Causal_passed: a list of the statistic of channels that passed the one-tailed Binomial test, otherwise, zero
    :param list_Causal_fail: a list of the statistic of channels that failed the one-tailed Binomial test, otherwise, zero
    :param list_causal_passed_err: a list of the error of the statistic of passed channels (currently not drawn)
    :param list_causal_failed_err: a list of the error of the statistic of failed channels (currently not drawn)
    :param list_Test: a list of results of the Binomial test, 'pass' or 'fail'
    :param ListChannelName: a list of channel names (numpy array)
    :param output_dir: a directory where the plot is saved
    :param output_file: a file name of the plot
    :param BinomialTestConfidence: binomial test confidence level
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName)
    # color the background either with black or gray for each channel group
    band_colors = ('black', 'gray')
    num_channels = list_Causal_passed.shape[-1]
    for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:])):
        if edgeRight == num_channels - 1:  # fix the background color to match plot size
            edgeRight = edgeRight + 1
        # cycle through the two colors so that any number of groups is supported
        span = plt.axvspan(edgeLeft, edgeRight, facecolor=band_colors[color_ind % len(band_colors)], alpha=0.5, edgecolor=None)
        span.set_edgecolor(None)
    # ++++++++ bar plot of the statistic of channels that fail or pass the Binomial test ++++++++
    failed = list_Causal_fail != 0
    passed = list_Causal_passed != 0
    plt.bar(np.where(failed)[0], list_Causal_fail[failed], 1, align='edge', edgecolor=None, color='lightblue', alpha=0.3, linewidth=0.)
    plt.bar(np.where(passed)[0], list_Causal_passed[passed], 1, align='edge', edgecolor=None, color='blue', linewidth=0.)
    # +++++++++++ a table of ranking channels +++++++++++++++++++
    # the statistic of every channel (each entry is non-zero in at most one of the two lists)
    list_Causal = list_Causal_passed + list_Causal_fail
    # sort the channels based on values of the statistic
    list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_Causal, list_Test)
    test_column = 'Test{}'.format(BinomialTestConfidence)
    # if the multi-frequency band search is chosen, the channel name is its name + _BP#
    # NOTE(review): this check only recognises a single-character band index after "BP"
    if ListChannelName[0][-3:-1] == "BP":
        # labels of the frequency bands; a band without limits (None-None) is labelled "All"
        list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
        # the modified channel names are ordered as Channel A in band 0, Channel A in band 1, ..., Channel B in band 0, ...
        # so the band labels are repeated as [band 0, band 1, ..., band 0, band 1, ...]
        list_freq_bands = np.tile(list_freq_bands, ListChannelName.shape[0])
        # get rid of the _BP# at the end of the channel name to recover the original name
        ListChannelName = np.array([ChannelName.rsplit("_BP", 1)[0] for ChannelName in ListChannelName])
        RankChannel = list(zip(ListChannelName[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_Causal[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
        # make pandas matrix
        Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]', 'WRS', test_column))
        # define the table size (rank, channel, band, WRS, test)
        ColWidths = [0.05, 0.6, 0.1, 0.1, 0.15]
    else:
        RankChannel = list(zip(ListChannelName[list_sorted_base_index_pass_fail], list_Causal[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
        # make pandas matrix
        Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'WRS', test_column))
        # define the table size (rank, channel, WRS, test)
        ColWidths = [0.05, 0.6, 0.1, 0.15]
    # add rank label
    Prodf = Prodf.assign(rank=np.arange(1, num_channels + 1, 1))
    # round the statistic
    Prodf['WRS'] = Prodf['WRS'].astype(float).round(3)
    # re-sort columns to make rank labels coming first
    cols = Prodf.columns.tolist()
    Prodf = Prodf[cols[-1:] + cols[:-1]]
    # take the first 5 high ranking channels
    dcsummary = Prodf.iloc[:5]
    # plot table on the top of the bar plot
    table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
    table.auto_set_font_size(False)
    table.set_fontsize(6)
    # ++++++++++++++ alter ticks +++++++++++++++++
    # plot xticks using the dominant sub sensor names
    plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
    # put ticks above
    plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
    plt.tick_params(axis="y", labelsize=10)
    plt.grid(False)
    plt.xlim(0, ListSubsys.shape[0])
    # set axis label
    plt.ylabel('Witness ratio statistic', fontsize=15)
    plt.xlabel('Auxiliary channel', fontsize=15)
    # leave room at the top of the figure for the table
    plt.tight_layout(rect=[0, 0.03, 1, 0.8])
    # save a plot
    output_path = os.path.join(output_dir, output_file)
    plt.savefig(output_path)
    plt.close()
    print('Completed Saving a plot of the witness ratio statisti VS channel')
    return None
def TableCausality(self, list_Causal_passed, list_Causal_fail, list_Test, ListChannelName, output_dir, output_file, BinomialTestConfidence, freq_bands=Const.freq_bands):
    '''
    description:
        make a table of witness ratio statistic (WRS) of channels as a .csv file,
        with the channels ordered by ranking_channels()
    dependencies: ranking_channels()
    USAGE: TableCausality(list_Causal_passed, list_Causal_fail, list_Test, ListChannelName, output_dir, output_file, BinomialTestConfidence)

    :param list_Causal_passed: a list of the statistic of channels that passed the one-tailed Binomial test, otherwise, zero
    :param list_Causal_fail: a list of the statistic of channels that failed the one-tailed Binomial test, otherwise, zero
    :param list_Test: a list of results of the Binomial test, 'pass' or 'fail'
    :param ListChannelName: a list of channel names (numpy array)
    :param output_dir: a directory where the table is saved
    :param output_file: a file name of the table
    :param BinomialTestConfidence: binomial test confidence level
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    # the statistic of every channel (each entry is non-zero in at most one of the two lists)
    wrs_all = list_Causal_passed + list_Causal_fail
    # indices of the channels, sorted based on values of the statistic
    ranked, _, _ = self.ranking_channels(wrs_all, list_Test)
    test_header = 'Test{}'.format(BinomialTestConfidence)
    multi_band = ListChannelName[0][-3:-1] == "BP"  # the multi-frequency band search appends _BP# to the channel name
    if multi_band:
        # labels of the frequency bands; a band without limits (None-None) is labelled "All"
        band_labels = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
        # the modified channel names are ordered as Channel A in band 0, Channel A in band 1, ..., Channel B in band 0, ...
        # so the band labels are repeated as [band 0, band 1, ..., band 0, band 1, ...]
        band_labels = np.tile(band_labels, ListChannelName.shape[0])
        # strip the _BP# suffix to recover the original channel names
        base_names = np.array([name.rsplit("_BP", 1)[0] for name in ListChannelName])
        rows = list(zip(base_names[ranked], band_labels[ranked], wrs_all[ranked], list_Test[ranked]))
        headers = ('Channel', 'Band [Hz]', 'WRS', test_header)
    else:
        # the single band table additionally carries the original index of each channel
        rows = list(zip(ranked, ListChannelName[ranked], wrs_all[ranked], list_Test[ranked]))
        headers = ('label', 'Channel', 'WRS', test_header)
    # make pandas matrix
    Prodf = pd.DataFrame(np.array(rows), columns=headers)
    # add rank label
    Prodf = Prodf.assign(rank=np.arange(1, wrs_all.shape[-1] + 1, 1))
    # round the statistic
    Prodf['WRS'] = Prodf['WRS'].astype(float).round(3)
    # move the rank labels (the last column) to the front
    ordered_columns = Prodf.columns.tolist()
    ordered_columns.insert(0, ordered_columns.pop())
    Prodf = Prodf[ordered_columns]
    # save it
    Prodf.to_csv(os.path.join(output_dir, output_file), index=False)
    print('Completed saving a table of the witness ratio statistic of channels')
    return None
# def calculate_FAP(self, df1, df0, mode):
# '''
# description:
# 1. calculate false alarm probability (FAP)
# USAGE: df_FAP = calculate_FAP(df1, df0, mode='indiv')
#
# :param df1: a target smaples in pandas frame
# :param df0: a null samples in pandas frame
# :param mode: a mode to calculate FAP. 'whole' calculates FAP of the average value of importance of glitches 'indiv' calculates FAP of each glitch in each channel
# :return:
# if mode == 'whole':
# list_FAP: numpy array of FAPs of channels
# list_FAP_err: numpy array of errors of FAP of channels
# channels: a list of channels
# USAGE: list_FAP, list_FAP_err = calculate_FAP(df1, df0, mode='whole')
# elif mode == 'indiv':
# df_FAP: a matrix of FAP of each glitch in each channel in pandas frame
# channels: a list of channels
# '''
# try:
# channels = np.array(df1.drop(['Unnamed: 0', 'GPS', 'duration'], axis=1).columns.tolist())
# except ValueError:
# channels = np.array(df1.drop(['Unnamed: 0', 'GPS'], axis=1).columns.tolist())
# if mode == 'whole': # calculate false alarm probablity (FAP) of the average value of importance of target glitches
# list_FAP = np.array([])
# list_FAP_err = np.array([])
# total_sample0 = df0.shape[0] # total number of dummy quite samples
# for ch in channels: # iterate though channels
# # calculate the threshold of claiming detection of a glitch
# d_c = df1['{}'.format(ch)].mean()
# # calculate the number of samples that have values of importance greater than the detection threshold
# num_d0 = df0[df0['{}'.format(ch)] > d_c].shape[0]
# # calculate the fraction samples that have values greater than the threshold, FAP
# FAP = num_d0 / float(total_sample0)
# # calculate the probability of the causality
# FAP_err = np.sqrt(FAP * (1 - FAP) / float(total_sample0)) # the error of false positive based on multinominal distribution
# # fill the values
# list_FAP = np.append(list_FAP, FAP)
# list_FAP_err = np.append(list_FAP_err, FAP_err)
# return list_FAP, list_FAP_err, channels
# if mode == 'indiv':
# df_FAP = pd.DataFrame()
# total_sample0 = float(df0.shape[0]) # total number of dummy quite samples
# for ch in channels: # iterate though channels
# # calculate FAP of each glitch for this channel
# # note that d_c is measured importance of this glitch in this channel
# # note that [df0[df0['{}'.format(ch)] > d_c].shape[0]/total_sample0 is a fraction of null samples above d_c, which is FAP
# FAP = np.array([df0[df0['{}'.format(ch)] > d_c].shape[0]/total_sample0 for d_c in df1['{}'.format(ch)]])
# df_FAP['{}'.format(ch)] = FAP
# return df_FAP, channels
def calculate_fap(self, target_mat, null_mat):
    '''
    description:
        calculate the false alarm probability (FAP) of each channel in each glitch
        1. fit a beta distribution to the null samples of every channel with the method of moments
        2. evaluate FAP = 1 - CDF_null(target value) for every target sample
    USAGE: faps = calculate_fap(target_mat, null_mat)

    :param target_mat: a matrix of target samples where glitch indices are in rows and channels are in columns
    :param null_mat: a matrix of null samples where glitch indices are in rows and channels are in columns
        NOTE(review): a beta distribution lives on [0, 1], so the values are presumably bounded to that range - confirm with the caller
    :return: faps: 1 - CDF of the fitted null distribution evaluated at target_mat (target_mat broadcast against the per-channel parameters)
    '''
    # per-channel mean and (unbiased, ddof=1) variance of the null samples
    # they are forced to float so that the manual assignments of 0.01 below are never truncated to 0
    list_mean_null = np.asarray(null_mat.mean(axis=0), dtype=float)
    list_var_null = np.asarray(null_mat.var(ddof=1, axis=0), dtype=float)

    # method-of-moments estimate of the beta parameters
    # -1 is a placeholder for the channels where the estimate is undefined (zero mean or zero variance)
    list_a_null = np.full(list_mean_null.shape, -1.0)
    list_b_null = np.full(list_mean_null.shape, -1.0)
    regular = (list_mean_null != 0) & (list_var_null != 0)
    list_a_null[regular] = list_mean_null[regular] ** 2 * (1. - list_mean_null[regular]) / list_var_null[regular] - list_mean_null[regular]
    nonzero_mean = list_mean_null != 0
    list_b_null[nonzero_mean] = list_a_null[nonzero_mean] * (1 - list_mean_null[nonzero_mean]) / list_mean_null[nonzero_mean]

    # if the mean and the variance of the null are both zero, the null distribution is a delta function at zero
    # and the FAP would be NaN; a beta distribution that is steep at zero is used instead
    both_zero = (list_mean_null == 0) & (list_var_null == 0)
    list_a_null[both_zero] = 0.01
    list_b_null[both_zero] = 100

    # a negative a_null is not allowed in a beta distribution
    # it appears when mean_null - mean_null ** 2 - var_null is negative (the null looks like a superposition of
    # two delta functions at 0 and 1) or when the placeholder -1 is still in place
    # a_null is set to 0.01 manually and b_null is re-calculated from it
    negative_a = list_a_null < 0
    list_a_null[negative_a] = 0.01
    list_b_null[negative_a] = list_a_null[negative_a] * (1 - list_mean_null[negative_a]) / list_mean_null[negative_a]

    rv_null = beta(list_a_null, list_b_null)
    faps = 1 - rv_null.cdf(target_mat)
    return faps
def calculate_reweighted_importance(self, df1, df_FAP):
    '''
    description:
        reweight the values of importance using FAP
        the reweighted importance is defined as rho_new = rho / (FAP + 1)
        every column of df1 other than 'Unnamed: 0', 'GPS' and 'duration' is treated as a channel
    USAGE: df1_new = calculate_reweighted_importance(df1, df_FAP)

    :param df1: a target samples in pandas frame
    :param df_FAP: a FAP matrix in pandas frame, it must have the channel columns of df1 and its rows are matched to df1 by position
    :return: df1_new: reweighted importance matrix in pandas frame (non-channel columns first, then the channels), indexed like df1
    '''
    # columns that carry metadata rather than importance
    # errors='ignore' skips the ones that are absent; the old "except ValueError" fallback did not work
    # because pandas raises KeyError for a missing label
    non_channel_columns = ['Unnamed: 0', 'GPS', 'duration']
    channels = np.array(df1.drop(non_channel_columns, axis=1, errors='ignore').columns.tolist())
    mat1 = df1[channels].values
    mat_FAP = df_FAP[channels].values
    mat1_new = mat1 / (mat_FAP + 1)
    # keep the index of df1 so that the column-wise concatenation below lines up row by row
    # even if df1 does not have the default 0..N-1 index
    df1_reweighed = pd.DataFrame(mat1_new, columns=channels, index=df1.index)
    df1_new = df1.drop(channels, axis=1)
    df1_new = pd.concat([df1_new, df1_reweighed], axis=1)
    return df1_new
def ranking_channels(self, list_ranking_statistic, list_Test):
    '''
    description:
        rank the entries by the ranking statistic and by the test result
        1. order the entries by the ranking statistic, largest first
        2. within that order, move every "pass" entry in front of every "fail" entry
        3. report the original indices, the statistic and the test labels in the final order
        entries whose test label is neither "pass" nor "fail" are left out of the result
    USAGE: list_sorted_base_index_pass_fail, list_sorted_ranking_statistic_pass_fail, list_sorted_Test_pass_fail = ranking_channels(list_ranking_statistic, list_Test)

    :param list_ranking_statistic: values of the ranking statistic in numpy array
    :param list_Test: test results {'pass', 'fail'} in numpy array, one per entry of list_ranking_statistic
    :return: original indices, ranking statistic and test results, all in the final ranked order
    '''
    base_index = np.arange(0, list_Test.shape[0], 1).astype('int')

    # positions that put the statistic in descending order
    descending = np.argsort(list_ranking_statistic)[::-1]
    test_descending = list_Test[descending]

    # positions (within the descending order) of the passed entries followed by the failed ones
    passed_first = np.concatenate((np.where(test_descending == 'pass')[0],
                                   np.where(test_descending == 'fail')[0]))

    def rearrange(values):
        # apply the descending sort and then the pass-before-fail grouping
        return values[descending][passed_first]

    return rearrange(base_index), rearrange(list_ranking_statistic), rearrange(list_Test)
def Plot_Welch_t_test(self, channels, list_t_values_passed, list_t_values_failed, list_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands):
    '''
    description:
        draw the outcome of the one-sided Welch t-test as a bar chart of t-value versus channel
        1. shade the background with alternating black/gray spans whose edges come from CreateChannelTicks
        2. draw the failed t-values in light blue and the passed t-values in blue
        3. put a table of the five top-ranked channels above the bars
        4. save the figure to output_dir/output_file and close it
    USAGE: Plot_Welch_t_test(channels, list_t_values_passed, list_t_values_failed, list_Test, confidence_level, output_dir, output_file)

    :param channels: a list of channels in numpy array
    :param list_t_values_passed: a list of t-values that pass the test (zero where the channel did not pass)
    :param list_t_values_failed: a list of t-values that fail the test (zero where the channel did not fail)
    :param list_Test: a list of the test results {'pass', 'fail'}
    :param confidence_level: a confidence level, only used in the header of the table
    :param output_dir: an output directory
    :param output_file: an output file name
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    tick_centers, span_edges, subsystems = self.CreateChannelTicks(channels)

    # ++++++++ background: one span per pair of consecutive edges, black and gray in turn ++++++++
    span_colors = ['black', 'gray'] * 20 * 7
    last_channel_index = list_t_values_passed.shape[-1] - 1
    for span_number, (left, right) in enumerate(zip(span_edges[:-1], span_edges[1:])):
        if right == last_channel_index:  # widen the last span so that it reaches the end of the plot
            right = right + 1
        span = plt.axvspan(left, right, facecolor=span_colors[span_number], alpha=0.5, edgecolor=None)
        span.set_edgecolor(None)

    # ++++++++ bars: only the non-zero entries are drawn ++++++++
    failed_mask = list_t_values_failed != 0
    passed_mask = list_t_values_passed != 0
    plt.bar(np.where(failed_mask)[0], list_t_values_failed[failed_mask], 1, align='edge', edgecolor=None, color='lightblue', alpha=0.3, linewidth=0.)
    plt.bar(np.where(passed_mask)[0], list_t_values_passed[passed_mask], 1, align='edge', edgecolor=None, color='blue', linewidth=0.)

    # ++++++++ table of the ranked channels ++++++++
    # passed and failed values never overlap, so their sum is the t-value of every channel
    t_values = list_t_values_passed + list_t_values_failed
    test_header = 'Test{}'.format(confidence_level)
    # in the multi-frequency band search the channel name ends with BP#
    multi_band = channels[0][-3:-1] == "BP"
    if multi_band:
        # label of every band, a band without a lower limit ('None') is shown as "All"
        band_labels = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
        # the channels are ordered as channel A in band 0, channel A in band 1, ..., channel B in band 0, ...
        # so the labels are repeated as band 0, band 1, ..., band 0, band 1, ...
        band_labels = np.tile(band_labels, channels.shape[0])
        # strip the BP# suffix to recover the original channel name
        channels = np.array([name.rsplit("_BP", 1)[0] for name in channels])
    ranked_index, _, _ = self.ranking_channels(t_values, list_Test)
    if multi_band:
        rows = list(zip(channels[ranked_index], band_labels[ranked_index], t_values[ranked_index], list_Test[ranked_index]))
        header = ('Channel', 'Band [Hz]', 't-value', test_header)
        ColWidths = [0.05, 0.6, 0.1, 0.1, 0.15]
    else:
        rows = list(zip(channels[ranked_index], t_values[ranked_index], list_Test[ranked_index]))
        header = ('Channel', 't-value', test_header)
        ColWidths = [0.05, 0.6, 0.1, 0.15]
    ranking = pd.DataFrame(np.array(rows), columns=header)
    # rank label, 1 is the highest rank
    ranking = ranking.assign(rank=np.arange(1, t_values.shape[-1] + 1, 1))
    # the values were turned into strings by np.array above, so convert them back before rounding
    ranking['t-value'] = ranking['t-value'].astype(float).round(3)
    # move the last column (the rank) to the front
    column_order = ranking.columns.tolist()
    column_order.insert(0, column_order.pop())
    ranking = ranking[column_order]
    # only the first 5 rows are shown on top of the bar plot
    top_rows = ranking.iloc[:5]
    table = plt.table(cellText=top_rows.values, colLabels=top_rows.columns, cellLoc='center', loc='top', colWidths=ColWidths)
    table.auto_set_font_size(False)
    table.set_fontsize(6)

    # ++++++++ ticks and labels ++++++++
    # x ticks show the sub sensor names and are drawn inside the top of the axes
    plt.xticks(tick_centers, subsystems[span_edges.astype(int)])
    plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
    plt.tick_params(axis="y", labelsize=10)
    plt.grid(False)
    plt.xlim(0, subsystems.shape[0])
    plt.ylabel('t-value', fontsize=15)
    plt.xlabel('Auxiliary channel', fontsize=15)
    # leave the upper 20% of the figure for the table
    plt.tight_layout(rect=[0, 0.03, 1, 0.8])

    # ++++++++ save ++++++++
    plt.savefig(os.path.join(output_dir, output_file))
    plt.close()
    print('Completed Saving a plot of t-value VS channel')
    return None
def Table_Welch_t_test(self, channels, list_t_values_passed, list_t_values_failed, list_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands):
    '''
    description:
        write the ranked result of the one-sided Welch t-test to output_dir/output_file as a .csv file
        the columns are rank, Channel, Band [Hz], t-value, Test<confidence_level> in the multi-frequency band search
        and rank, label, Channel, t-value, Test<confidence_level> otherwise, where label is the original index of the channel
    USAGE: Table_Welch_t_test(channels, list_t_values_passed, list_t_values_failed, list_Test, confidence_level, output_dir, output_file)

    :param channels: a list of channels in numpy array
    :param list_t_values_passed: a list of t-values that pass the test (zero where the channel did not pass)
    :param list_t_values_failed: a list of t-values that fail the test (zero where the channel did not fail)
    :param list_Test: a list of the test results {'pass', 'fail'}
    :param confidence_level: a confidence level, only used in the header of the table
    :param output_dir: an output directory
    :param output_file: an output file name
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    # passed and failed values never overlap, so their sum is the t-value of every channel
    t_values = list_t_values_passed + list_t_values_failed
    test_header = 'Test{}'.format(confidence_level)
    # in the multi-frequency band search the channel name ends with BP#
    multi_band = channels[0][-3:-1] == "BP"
    if multi_band:
        # label of every band, a band without a lower limit ('None') is shown as "All"
        band_labels = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
        # the channels are ordered as channel A in band 0, channel A in band 1, ..., channel B in band 0, ...
        # so the labels are repeated as band 0, band 1, ..., band 0, band 1, ...
        band_labels = np.tile(band_labels, channels.shape[0])
        # strip the BP# suffix to recover the original channel name
        channels = np.array([name.rsplit("_BP", 1)[0] for name in channels])
    ranked_index, _, _ = self.ranking_channels(t_values, list_Test)
    if multi_band:
        rows = list(zip(channels[ranked_index], band_labels[ranked_index], t_values[ranked_index], list_Test[ranked_index]))
        header = ('Channel', 'Band [Hz]', 't-value', test_header)
    else:
        # without bands the original index of the channel is stored as well
        rows = list(zip(ranked_index, channels[ranked_index], t_values[ranked_index], list_Test[ranked_index]))
        header = ('label', 'Channel', 't-value', test_header)
    ranking = pd.DataFrame(np.array(rows), columns=header)
    # rank label, 1 is the highest rank
    ranking = ranking.assign(rank=np.arange(1, t_values.shape[-1] + 1, 1))
    # the values were turned into strings by np.array above, so convert them back before rounding
    ranking['t-value'] = ranking['t-value'].astype(float).round(3)
    # move the last column (the rank) to the front
    column_order = ranking.columns.tolist()
    column_order.insert(0, column_order.pop())
    ranking = ranking[column_order]
    # save it
    ranking.to_csv(os.path.join(output_dir, output_file), index=False)
    print('Completed saving a table of the t-value of channels')
    return None
################
def Plot_point_chisqur_test(self, channels_band, list_GPS, list_duration, mat_chsqr_passed, mat_chsqr_failed, mat_Test, p_values, confidence_level, output_dir, output_file=None, freq_bands=Const.freq_bands):
    '''
    description:
        plot the result of the point chi-square test, one figure per glitch, and save the tables
        1. for every glitch, draw the chi-square value of every channel as a bar (failed: light blue, passed: blue)
           with a table of the five top-ranked channels on top, and save the figure
        2. save the chi-square values of all glitches to output_dir/chisqr.csv
        3. save the p-values of all glitches to output_dir/p_values.csv
    USAGE: Plot_point_chisqur_test(channels_band, list_GPS, list_duration, mat_chsqr_passed, mat_chsqr_failed, mat_Test, p_values, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands)

    :param channels_band: a list of channels in numpy array
    :param list_GPS: a list of GPS times in numpy array
    :param list_duration: a list of durations in numpy array
    :param mat_chsqr_passed: a matrix of "passed" chi-square values where glitch indices are in rows and channels are columns in numpy array
        note that channels in glitches that passed the test have non-zero values, otherwise zero
    :param mat_chsqr_failed: a matrix of "failed" chi-square values where glitch indices are in rows and channels are columns in numpy array
        note that channels in glitches that failed the test have non-zero values, otherwise zero
    :param mat_Test: a matrix of the test results {'pass', 'fail'} where glitch indices are in rows and channels are columns
    :param p_values: a matrix of p-values where glitch indices are in rows and channels are columns in numpy array
    :param confidence_level: a confidence level, only used in the header of the table
    :param output_dir: an output directory
    :param output_file: an output file name; if None each glitch is saved as <GPS with '.' replaced by '-'>_chisqr.pdf
        note that a given file name is used for every glitch, so a later glitch overwrites an earlier one
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    for GPS, duration, list_chsqr_passed, list_chsqr_failed, list_Test in zip(list_GPS, list_duration, mat_chsqr_passed, mat_chsqr_failed, mat_Test):
        CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(channels_band)
        # color the background either with black or gray, one span per pair of consecutive edges
        Listcolor = ['black', 'gray'] * 20 * 7
        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
            if edgeRight == list_chsqr_passed.shape[-1] - 1:  # fix the background color to match plot size
                edgeRight = edgeRight + 1
            ax = plt.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
            ax.set_edgecolor(None)
        # ++++++++ bar plot of the chi-square values that pass or fail the test ++++++++++++++++
        plt.bar(np.where(list_chsqr_failed != 0)[0], list_chsqr_failed[list_chsqr_failed != 0], 1, align='center', edgecolor=None, color='lightblue', alpha=0.3, linewidth=0.)
        plt.bar(np.where(list_chsqr_passed != 0)[0], list_chsqr_passed[list_chsqr_passed != 0], 1, align='center', edgecolor=None, color='blue', linewidth=0.)
        # +++++++++++ plot a table of ranking channels +++++++++++++++++++
        # passed and failed values never overlap, so their sum is the chi-square of every channel
        list_chsqr = list_chsqr_passed + list_chsqr_failed
        if channels_band[0][-3:-1] == "BP":  # in the multi-frequency band search the channel name ends with BP#
            # label of every band, a band without a lower limit ('None') is shown as "All"
            list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
            # the channels are ordered as channel A in band 0, channel A in band 1, ..., channel B in band 0, ...
            # so the labels are repeated as band 0, band 1, ..., band 0, band 1, ...
            list_freq_bands = np.tile(list_freq_bands, channels_band.shape[0])
            # strip the BP# suffix to recover the original channel name
            channels = np.array([channel.rsplit("_BP", 1)[0] for channel in channels_band])
            # sort the channels based on values of statistic
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_chsqr, list_Test)
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_chsqr[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]', 'chi-square', 'Test{}'.format(confidence_level)))
            # define the table size
            ColWidths = [0.05, 0.6, 0.1, 0.1, 0.15]
        else:
            # the names carry no band suffix, so they are used as they are
            # (without this assignment "channels" is undefined here and in the title below)
            channels = channels_band
            # sort the channels based on values of statistic
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_chsqr, list_Test)
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_chsqr[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'chi-square', 'Test{}'.format(confidence_level)))
            # define the table size
            ColWidths = [0.05, 0.6, 0.1, 0.15]
        # add rank label, 1 is the highest rank
        Prodf = Prodf.assign(rank=np.arange(1, list_chsqr.shape[-1] + 1, 1))
        # the values were turned into strings by np.array above, so convert them back before rounding
        Prodf['chi-square'] = Prodf['chi-square'].astype(float).round(3)
        # re-sort columns to make rank labels coming first
        cols = Prodf.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        Prodf = Prodf[cols]
        # take the first 5 high ranking channels
        dcsummary = Prodf.iloc[:5]
        # plot table on the top of the bar plot
        table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
        table.auto_set_font_size(False)
        table.set_fontsize(6)
        # ++++++++++++++ alter ticks +++++++++++++++++
        # plot xticks using the dominant sub sensor names
        plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
        # put ticks above
        plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
        plt.tick_params(axis="y", labelsize=10)
        plt.grid(False)
        plt.xlim(0, ListSubsys.shape[0])
        # make title: centered in x, 40% above the tallest bar in y
        plt.text(channels.shape[0]*0.5, list_chsqr.max()*1.4, 'GPS = {GPS:.02f}, duration = {duration:.3f} s'.format(GPS=GPS, duration=duration), ha='center', va='center', fontsize=10)
        plt.ylabel('Chi-square', fontsize=15)
        plt.xlabel('Auxiliary channel', fontsize=15)
        plt.tight_layout(rect=[0, 0.03, 1, 1])
        # save a plot
        if output_file is None:
            output_path = os.path.join(output_dir, '{}_chisqr.pdf'.format(str(GPS).replace('.', '-')))
        else:
            output_path = os.path.join(output_dir, output_file)
        plt.savefig(output_path)
        plt.close()
        print('Completed Saving a plot of chi-square VS channel of a glitch at {}'.format(GPS))
    #------------------+
    # save the table   |
    #------------------+
    # for chi-squares: GPS and duration are put in front of the channel columns
    mat_chsqr = mat_chsqr_passed + mat_chsqr_failed
    data_GPSduration = np.hstack((list_GPS.reshape(list_GPS.shape[0], 1), list_duration.reshape(list_duration.shape[0], 1)))
    data_chisqr = np.hstack((data_GPSduration, mat_chsqr))
    columns_chisqr = np.append(np.array(["GPS", "duration"]), channels_band)
    df_chisqr = pd.DataFrame(data_chisqr, columns=columns_chisqr)
    df_chisqr.to_csv(os.path.join(output_dir, 'chisqr.csv'))
    print("chi-square table is saved")
    # for p-values: same layout as the chi-square table
    data_pvalue = np.hstack((data_GPSduration, p_values))
    df_pvalue = pd.DataFrame(data_pvalue, columns=columns_chisqr)
    df_pvalue.to_csv(os.path.join(output_dir, 'p_values.csv'))
    print("p-value table is saved")
    return None
def Plot_fap(self, channels_band, list_GPS, list_duration, mat_fap, mat_Test, confidence_level, output_dir, output_file=None, freq_bands=Const.freq_bands):
    '''
    description:
        plot 1/FAP of every channel, one figure per glitch
        1. channels with FAP >= 1 - confidence_level are drawn in light blue, the others in blue, on a log scale
        2. a table of the five top-ranked channels (ranked with -log10(FAP)) is put on top
        3. a FAP of exactly zero is replaced by the smallest non-zero FAP of mat_fap in the plot and in the table,
           the matrix given by the caller is left untouched
    USAGE: Plot_fap(channels_band, list_GPS, list_duration, mat_fap, mat_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands)

    :param channels_band: a list of channels in numpy array
    :param list_GPS: a list of GPS times in numpy array
    :param list_duration: a list of durations in numpy array
    :param mat_fap: a matrix of FAP where glitch indices are in rows and channels are columns in numpy array
    :param mat_Test: a matrix of the test results {'pass', 'fail'} where glitch indices are in rows and channels are columns
    :param confidence_level: a confidence level, 1 - confidence_level is the FAP threshold of the bar colors
    :param output_dir: an output directory
    :param output_file: an output file name; if None each glitch is saved as <GPS with '.' replaced by '-'>_fap.pdf
        note that a given file name is used for every glitch, so a later glitch overwrites an earlier one
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    for GPS, duration, list_fap, list_Test in zip(list_GPS, list_duration, mat_fap, mat_Test):
        CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(channels_band)
        # color the background either with black or gray, one span per pair of consecutive edges
        Listcolor = ['black', 'gray'] * 20 * 7
        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
            if edgeRight == list_fap.shape[-1] - 1:  # fix the background color to match plot size
                edgeRight = edgeRight + 1
            ax = plt.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
            ax.set_edgecolor(None)
        # ++++++++ bar plot of 1 / fap ++++++++++++++++
        # work on a copy: list_fap is a row of mat_fap, so changing it in place would change the caller's matrix
        list_fap_for_plot = list_fap.copy()
        # 1 / fap is infinite when fap = 0, so fap = 0 is replaced with the minimum non-zero fap in mat_fap
        list_fap_for_plot[list_fap_for_plot == 0] = mat_fap[mat_fap != 0].min()
        above_threshold = list_fap_for_plot >= (1 - confidence_level)
        below_threshold = list_fap_for_plot < (1 - confidence_level)
        plt.bar(np.where(above_threshold)[0], 1 / list_fap_for_plot[above_threshold], 1, align='center', edgecolor=None, color='lightblue', alpha=0.3, linewidth=0.)
        plt.bar(np.where(below_threshold)[0], 1 / list_fap_for_plot[below_threshold], 1, align='center', edgecolor=None, color='blue', linewidth=0.)
        # +++++++++++ plot a table of ranking channels +++++++++++++++++++
        # the zero-replaced values are used for the ranking and the table as well, which keeps -log10 finite
        if channels_band[0][-3:-1] == "BP":  # in the multi-frequency band search the channel name ends with BP#
            # label of every band, a band without a lower limit ('None') is shown as "All"
            list_freq_bands = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
            # the channels are ordered as channel A in band 0, channel A in band 1, ..., channel B in band 0, ...
            # so the labels are repeated as band 0, band 1, ..., band 0, band 1, ...
            list_freq_bands = np.tile(list_freq_bands, channels_band.shape[0])
            # strip the BP# suffix to recover the original channel name
            channels = np.array([channel.rsplit("_BP", 1)[0] for channel in channels_band])
            # sort the channels based on values of statistic
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(-np.log10(list_fap_for_plot), list_Test)
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_fap_for_plot[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]', 'FAP', 'Test{}'.format(confidence_level)))
            # define the table size
            ColWidths = [0.05, 0.6, 0.1, 0.1, 0.15]
        else:
            # the names carry no band suffix, so they are used as they are
            # (without this assignment "channels" is undefined here and in the title below)
            channels = channels_band
            # sort the channels based on values of statistic
            list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(-np.log10(list_fap_for_plot), list_Test)
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_fap_for_plot[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'FAP', 'Test{}'.format(confidence_level)))
            # define the table size
            ColWidths = [0.05, 0.6, 0.1, 0.15]
        # add rank label, 1 is the highest rank
        Prodf = Prodf.assign(rank=np.arange(1, list_fap.shape[-1] + 1, 1))
        # show the FAP in scientific notation with two decimals
        Prodf['FAP'] = ['{:.2e}'.format(fap) for fap in Prodf['FAP'].astype(float)]
        # re-sort columns to make rank labels coming first
        cols = Prodf.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        Prodf = Prodf[cols]
        # take the first 5 high ranking channels
        dcsummary = Prodf.iloc[:5]
        # plot table on the top of the bar plot
        table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
        table.auto_set_font_size(False)
        table.set_fontsize(6)
        # ++++++++++++++ alter ticks +++++++++++++++++
        # plot xticks using the dominant sub sensor names
        plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
        # put ticks above
        plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
        plt.tick_params(axis="y", labelsize=10)
        plt.grid(False)
        plt.xlim(0, ListSubsys.shape[0])
        # make title: centered in x; the y position is the tallest bar raised to the power 1.4,
        # i.e. 40% higher on the logarithmic axis set below
        plt.text(channels.shape[0]*0.5, (1/list_fap_for_plot).max()**1.4, 'GPS = {GPS:.02f}, duration = {duration:.3f} s'.format(GPS=GPS, duration=duration), ha='center', va='center', fontsize=10)
        plt.ylim(ymin=1)
        plt.ylabel('1/FAP', fontsize=15)
        plt.xlabel('Auxiliary channel', fontsize=15)
        plt.yscale('log')
        plt.tight_layout(rect=[0, 0.03, 1, 1])
        # save a plot
        if output_file is None:
            output_path = os.path.join(output_dir, '{}_fap.pdf'.format(str(GPS).replace('.', '-')))
        else:
            output_path = os.path.join(output_dir, output_file)
        plt.savefig(output_path)
        plt.close()
        print('Completed Saving a plot of FAP VS channel of a glitch at {}'.format(GPS))
    return None
def Save_fap_csv(self, channels, list_GPS, list_duration, mat_fap, output_dir):
    '''
    description:
        write the FAP of every channel for every glitch to output_dir/fap.csv
        the columns are GPS, duration and then one column per channel, one row per glitch
    USAGE: Save_fap_csv(channels, list_GPS, list_duration, mat_fap, output_dir)

    :param channels: a list of channels, used as the names of the FAP columns
    :param list_GPS: a list of GPS times in numpy array
    :param list_duration: a list of durations in numpy array
    :param mat_fap: a matrix of FAP of each channel for each glitch (glitches in rows, channels in columns)
    :param output_dir: output directory
    :return: None
    '''
    # turn the two 1D lists into column vectors so that they can sit in front of the FAP matrix
    gps_column = list_GPS.reshape((list_GPS.shape[0], 1))
    duration_column = list_duration.reshape((list_duration.shape[0], 1))
    table_values = np.concatenate((gps_column, duration_column, mat_fap), axis=1)
    # header: the two fixed names followed by the channel names
    header = np.append(np.array(["GPS", "duration"]), channels)
    # save it (the row index is written as the first, unnamed column)
    csv_path = os.path.join(output_dir, 'fap.csv')
    pd.DataFrame(table_values, columns=header).to_csv(csv_path)
    print("FAP table is saved")
    return None
def Plot_p_greater(self, channels, list_p_greater_passed, list_p_greater_failed, list_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands):
    '''
    description:
        draw p_greater versus channel as a bar chart
        1. shade the background with alternating black/gray spans whose edges come from CreateChannelTicks
        2. draw the failed values in light blue and the passed values in red
        3. put a table of the five top-ranked channels above the bars
        4. save the figure to output_dir/output_file and close it
    USAGE: Plot_p_greater(channels, list_p_greater_passed, list_p_greater_failed, list_Test, confidence_level, output_dir, output_file)

    :param channels: a list of channels in numpy array
    :param list_p_greater_passed: a list of p_greater above confidence level (zero where the channel did not pass)
    :param list_p_greater_failed: a list of p_greater that fail the test (zero where the channel did not fail)
    :param list_Test: a list of the test results {'pass', 'fail'}
    :param confidence_level: a confidence level, only used in the header of the table
    :param output_dir: an output directory
    :param output_file: an output file name
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    tick_centers, span_edges, subsystems = self.CreateChannelTicks(channels)

    # ++++++++ background: one span per pair of consecutive edges, black and gray in turn ++++++++
    span_colors = ['black', 'gray'] * 20 * 7
    last_channel_index = list_p_greater_passed.shape[-1] - 1
    for span_number, (left, right) in enumerate(zip(span_edges[:-1], span_edges[1:])):
        if right == last_channel_index:  # widen the last span so that it reaches the end of the plot
            right = right + 1
        span = plt.axvspan(left, right, facecolor=span_colors[span_number], alpha=0.5, edgecolor=None)
        span.set_edgecolor(None)

    # ++++++++ bars: only the non-zero entries are drawn ++++++++
    failed_mask = list_p_greater_failed != 0
    passed_mask = list_p_greater_passed != 0
    plt.bar(np.where(failed_mask)[0], list_p_greater_failed[failed_mask], 1, align='edge', edgecolor=None, color='lightblue', alpha=0.3, linewidth=0.)
    plt.bar(np.where(passed_mask)[0], list_p_greater_passed[passed_mask], 1, align='edge', edgecolor=None, color='red', linewidth=0.)

    # ++++++++ table of the ranked channels ++++++++
    # passed and failed values never overlap, so their sum is the p_greater of every channel
    p_greater = list_p_greater_passed + list_p_greater_failed
    # header of the probability column
    name_p_greater = r'$p_g$'
    threshold_header = 'Threshold {}'.format(confidence_level)
    # in the multi-frequency band search the channel name ends with BP#
    multi_band = channels[0][-3:-1] == "BP"
    if multi_band:
        # label of every band, a band without a lower limit ('None') is shown as "All"
        band_labels = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
        # the channels are ordered as channel A in band 0, channel A in band 1, ..., channel B in band 0, ...
        # so the labels are repeated as band 0, band 1, ..., band 0, band 1, ...
        band_labels = np.tile(band_labels, channels.shape[0])
        # strip the BP# suffix to recover the original channel name
        channels = np.array([name.rsplit("_BP", 1)[0] for name in channels])
    ranked_index, _, _ = self.ranking_channels(p_greater, list_Test)
    if multi_band:
        rows = list(zip(channels[ranked_index], band_labels[ranked_index], p_greater[ranked_index], list_Test[ranked_index]))
        header = ('Channel', 'Band [Hz]', name_p_greater, threshold_header)
        ColWidths = [0.05, 0.6, 0.1, 0.1, 0.15]
    else:
        rows = list(zip(channels[ranked_index], p_greater[ranked_index], list_Test[ranked_index]))
        header = ('Channel', name_p_greater, threshold_header)
        ColWidths = [0.05, 0.6, 0.1, 0.15]
    ranking = pd.DataFrame(np.array(rows), columns=header)
    # rank label, 1 is the highest rank
    ranking = ranking.assign(rank=np.arange(1, p_greater.shape[-1] + 1, 1))
    # the values were turned into strings by np.array above, so convert them back before rounding
    ranking[name_p_greater] = ranking[name_p_greater].astype(float).round(5)
    # move the last column (the rank) to the front
    column_order = ranking.columns.tolist()
    column_order.insert(0, column_order.pop())
    ranking = ranking[column_order]
    # only the first 5 rows are shown on top of the bar plot
    top_rows = ranking.iloc[:5]
    table = plt.table(cellText=top_rows.values, colLabels=top_rows.columns, cellLoc='center', loc='top', colWidths=ColWidths)
    table.auto_set_font_size(False)
    table.set_fontsize(6)

    # ++++++++ ticks and labels ++++++++
    # x ticks show the sub sensor names and are drawn inside the top of the axes
    plt.xticks(tick_centers, subsystems[span_edges.astype(int)])
    plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
    plt.tick_params(axis="y", labelsize=10)
    plt.grid(False)
    plt.xlim(0, subsystems.shape[0])
    plt.ylabel(r'$p_g$', fontsize=15)
    plt.xlabel('Auxiliary channel', fontsize=15)
    plt.tight_layout(rect=[0, 0.03, 1, 1])

    # ++++++++ save ++++++++
    plt.savefig(os.path.join(output_dir, output_file))
    plt.close()
    print('Completed Saving a plot of p_greater VS channel')
    return None
def Table_p_greater(self, channels, list_p_greater_passed, list_p_greater_failed, list_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands):
    '''
    description:
        rank the channels by p_greater and save the ranking as a .csv file
    USAGE: Table_p_greater(channels, list_p_greater_passed, list_p_greater_failed, list_Test, confidence_level, output_dir, output_file)

    :param channels: a numpy array of channel names (names ending in "_BP#" trigger the multi-frequency band layout)
    :param list_p_greater_passed: a numpy array of p_greater of the channels that pass the test
    :param list_p_greater_failed: a numpy array of p_greater of the channels that fail the test
    :param list_Test: a numpy array of the test results {'pass', 'fail'}
    :param confidence_level: a confidence level, only used in the name of the test column
    :param output_dir: an output directory
    :param output_file: an output file name
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    # the two inputs are added element-wise to get one statistic per channel
    # NOTE(review): presumably each array is zero where the other one is populated -- confirm against the caller
    p_greater_values = list_p_greater_passed + list_p_greater_failed
    test_column = 'Test{}'.format(confidence_level)
    if channels[0][-3:-1] == "BP":
        # multi-frequency band search: every channel name carries a "_BP#" suffix
        # human readable band labels; a band whose lower edge is 'None' is shown as "All"
        band_labels = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
        # channels are ordered as (channel A, band 0), (channel A, band 1), ..., (channel B, band 0), ...
        # so the band labels are laid out as [band 0, band 1, ..., band 0, band 1, ...]
        band_grid = np.repeat(band_labels.reshape(band_labels.shape[0], 1), channels.shape[0], axis=1)
        band_of_channel = band_grid.flatten(order='F')
        # strip the "_BP#" suffix to recover the original channel names
        base_names = np.array([name.rsplit("_BP", 1)[0] for name in channels])
        # order of the channels according to the ranking helper
        order, _, _ = self.ranking_channels(p_greater_values, list_Test)
        rows = list(zip(base_names[order], band_of_channel[order], p_greater_values[order], list_Test[order]))
        header = ('Channel', 'Band [Hz]', 'p_greater', test_column)
    else:
        # single band search: keep the original channel index as a 'label' column
        order, _, _ = self.ranking_channels(p_greater_values, list_Test)
        rows = list(zip(order, channels[order], p_greater_values[order], list_Test[order]))
        header = ('label', 'Channel', 'p_greater', test_column)
    ranking_table = pd.DataFrame(np.array(rows), columns=header)
    # rank labels start from 1
    ranking_table = ranking_table.assign(rank=np.arange(1, p_greater_values.shape[-1] + 1, 1))
    # the values were stored as strings in the mixed-type array, so convert them back before rounding
    ranking_table['p_greater'] = ranking_table['p_greater'].astype(float).round(6)
    # move the freshly appended rank column to the front
    column_order = ranking_table.columns.tolist()
    ranking_table = ranking_table[column_order[-1:] + column_order[:-1]]
    # save it
    ranking_table.to_csv(os.path.join(output_dir, output_file), index=False)
    print('Completed saving a table of the p_greater of channels')
    return None
def Save_p_belong_csv(self, channels_passed, list_GPS, list_duration, mat_p_belong, output_dir):
    '''
    description:
        save the p_belong table as 'p_belong.csv' in the output directory
        the table has one row per glitch with the columns GPS, duration and one column per channel
        NOTE(review): channels_passed is presumably restricted by the caller to the channels that passed the p_greater test -- confirm
    USAGE: Save_p_belong_csv(channels_passed, list_GPS, list_duration, mat_p_belong, output_dir)

    :param channels_passed: a list of channel names, used as the column names of mat_p_belong
    :param list_GPS: a numpy array of GPS times
    :param list_duration: a numpy array of durations
    :param mat_p_belong: a matrix of p_belong of each channel (columns) for each glitch (rows)
    :param output_dir: output directory
    :return: None
    '''
    # turn the two 1-D inputs into column vectors so that they can be stacked next to the matrix
    gps_column = list_GPS.reshape(list_GPS.shape[0], 1)
    duration_column = list_duration.reshape(list_duration.shape[0], 1)
    gps_and_duration = np.hstack((gps_column, duration_column))
    table_values = np.hstack((gps_and_duration, mat_p_belong))
    # column names: the two fixed ones followed by the channel names
    header = np.append(np.array(["GPS", "duration"]), channels_passed)
    p_belong_table = pd.DataFrame(table_values, columns=header)
    # the row index is written as well (pandas default)
    p_belong_table.to_csv(os.path.join(output_dir, 'p_belong.csv'))
    print("p_belong table is saved")
    return None
def Plot_p_belong(self, channels_band, list_GPS, list_duration, mat_p_belong, mat_Test, confidence_level, output_dir, output_file=None, freq_bands=Const.freq_bands):
    '''
    description:
        plot p_belong VS channel for every glitch, with a table of the five highest ranking channels on top
    USAGE: Plot_p_belong(channels_band, list_GPS, list_duration, mat_p_belong, mat_Test, confidence_level, output_dir, output_file, freq_bands=Const.freq_bands)

    :param channels_band: a list of channels in numpy array (names ending in "_BP#" trigger the multi-frequency band layout)
    :param list_GPS: a list of GPS times in numpy array
    :param list_duration: a list of durations in numpy array
    :param mat_p_belong: matrix of p_belong, one row per glitch
    :param mat_Test: matrix of the test results {'pass', 'fail'}, one row per glitch
    :param confidence_level: values of p_belong above this level are drawn in solid blue, the others in faint light blue
    :param output_dir: an output directory
    :param output_file: an output file name; if None, '<GPS>_p_belong.pdf' is used for each glitch
    :param freq_bands: frequency bands used for the multi-frequency band search, which is defined in const.py
    :return: None
    '''
    # the background alternates between these two colors from one sub-system to the next
    Listcolor = ['black', 'gray']
    for GPS, duration, list_p_belong, list_Test in zip(list_GPS, list_duration, mat_p_belong, mat_Test):
        CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(channels_band)
        for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
            if edgeRight == list_p_belong.shape[-1] - 1:  # fix the background color to match plot size
                edgeRight = edgeRight + 1
            # modulo indexing keeps the alternation without limiting the number of sub-systems
            ax = plt.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind % 2], alpha=0.5, edgecolor=None)
            ax.set_edgecolor(None)
        # bars at or below the confidence level are faint, bars above it are solid
        below_level = list_p_belong <= confidence_level
        above_level = list_p_belong > confidence_level
        plt.bar(np.where(below_level)[0], list_p_belong[below_level], 1, align='center', edgecolor=None, color='lightblue', alpha=0.3, linewidth=0.)
        plt.bar(np.where(above_level)[0], list_p_belong[above_level], 1, align='center', edgecolor=None, color='blue', linewidth=0.)
        # +++++++++++ plot a table of ranking channels +++++++++++++++++++
        if channels_band[0][-3:-1] == "BP":  # multi-frequency band search: the channel name is its original name + "_BP#"
            # human readable band labels; a band whose lower edge is 'None' is shown as "All"
            band_labels = np.array(["{}-{}".format(lower, upper) if lower != 'None' else "All" for lower, upper in freq_bands])
            # look the band up from the number after "BP" in each channel name
            # (the caller's note says only channels that passed p_greater are given, so a regular layout cannot be assumed)
            list_freq_bands = np.array([band_labels[int(ch.split('BP')[-1])] for ch in channels_band])
            # get rid of the "_BP#" at the end of the channel name to recover the original name
            channels = np.array([channel.rsplit("_BP", 1)[0] for channel in channels_band])
        else:
            list_freq_bands = None
            # the title placement below needs `channels` in this branch too; it used to be undefined here
            channels = channels_band
        # sort the channels based on the values of the statistic
        list_sorted_base_index_pass_fail, _, _ = self.ranking_channels(list_p_belong, list_Test)
        if list_freq_bands is not None:
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_freq_bands[list_sorted_base_index_pass_fail], list_p_belong[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'Band [Hz]', 'p_belong', 'Test{}'.format(confidence_level)))
            # column widths of the table: rank, channel, band, p_belong, test
            ColWidths = [0.05, 0.6, 0.1, 0.1, 0.15]
        else:
            RankChannel = list(zip(channels[list_sorted_base_index_pass_fail], list_p_belong[list_sorted_base_index_pass_fail], list_Test[list_sorted_base_index_pass_fail]))
            Prodf = pd.DataFrame(np.array(RankChannel), columns=('Channel', 'p_belong', 'Test{}'.format(confidence_level)))
            # column widths of the table: rank, channel, p_belong, test
            ColWidths = [0.05, 0.6, 0.1, 0.15]
        # add rank label
        Prodf = Prodf.assign(rank=np.arange(1, list_p_belong.shape[-1] + 1, 1))
        # show p_belong with four decimals
        Prodf['p_belong'] = ['{:.4f}'.format(fap) for fap in Prodf['p_belong'].astype(float)]
        # re-sort columns to make rank labels coming first
        cols = Prodf.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        Prodf = Prodf[cols]
        # take the first 5 high ranking channels
        dcsummary = Prodf.iloc[:5]
        # plot table on the top of the bar plot
        table = plt.table(cellText=dcsummary.values, colLabels=dcsummary.columns, cellLoc='center', loc='top', colWidths=ColWidths)
        table.auto_set_font_size(False)
        table.set_fontsize(6)
        # ++++++ set labels, etc ++++++++++++++++++++++++++++++++++++++
        # plot xticks using the dominant sub sensor names
        plt.xticks(CenterTicks, ListSubsys[ListInd.astype(int)])
        # put ticks above
        plt.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-25, length=0, size=0, labelrotation=90, labelsize=10)
        plt.tick_params(axis="y", labelsize=10)
        plt.grid(False)
        plt.xlim(0, ListSubsys.shape[0])
        # make title, centered horizontally and placed above the highest bar
        plt.text(channels.shape[0]*0.5, list_p_belong.max()*1.4, 'GPS = {GPS:.02f}, duration = {duration:.3f} s'.format(GPS=GPS, duration=duration), ha='center', va='center', fontsize=10)
        plt.ylabel('p_belong', fontsize=15)
        plt.xlabel('Auxiliary channel', fontsize=15)
        plt.tight_layout(rect=[0, 0.03, 1, 1])
        # save a plot
        if output_file is None:
            output_path = os.path.join(output_dir, '{}_p_belong.pdf'.format(str(GPS).replace('.', '-')))
        else:
            # NOTE(review): with an explicit output_file every glitch is written to the same path,
            # so only the last plot survives when more than one glitch is given
            output_path = os.path.join(output_dir, output_file)
        plt.savefig(output_path)
        plt.close()
        print('Completed Saving a plot of p_belong VS channel of a glitch at {}'.format(GPS))
    return None
###################
# def Determine_number_of_subclass(self, Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset, test_confidence):
# '''
# description:
# 1. query the clustered target samples
# 2. query null samples
# 3. perform one-sided binomial test and one-sided Welch t-test on each subclass
# 4. count the number of subclasses which has at least one channel passing the both tests
# USAGE: num_subclass = Determine_number_of_subclass(Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset, test_confidence)
#
# :param Path_Target_Glitch_SubClassClustered_Dataset: a path to the clustered target samples
# :param Path_Null_Dataset: a path to null samples
# :param test_confidence: a statistical confidence level
# :return: num_subclass: the number of subclasses which has at least one channel passing the both tests
# '''
# num_subclass = 0 # set it zero
# # query the clustered target samples and null samples
# df_target, df_null = self.query_targetglitch_null(Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset)
# # get a list of subclass labels (non-duplicated)
# SubClass_labels = df_target.SubclassLabel.unique().astype('int')
# for Sub_class_Label in SubClass_labels: # iterate through subclasses
# df_target_sub = df_target[df_target.SubclassLabel == Sub_class_Label].drop(['SubclassLabel'], axis=1)
# # binomial test
# list_Causal_passed, list_Causal_fail, list_causal_passed_err, list_causal_failed_err, list_binomi_Test, list_channels = self.find_meaing_ful_confidence(df_target_sub, df_null, test_confidence)
# # Welch t-test
# channels, list_t_values_passed, list_t_values_failed, list_t_Test, confidence_level_used = self.perform_Welch_test(df_target_sub, df_null, test_confidence)
# if list_binomi_Test[list_binomi_Test == 'pass'].shape[0] > 0 and list_t_Test[list_t_Test == 'pass'].shape[0] > 0: # if there is at least one channel that passes both tests
# num_subclass += 1 # count 1
# return num_subclass
def Determine_number_of_subclass(self, Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset, test_confidence):
    '''
    description:
        1. query the clustered target samples and the null samples
        2. estimate the beta distributions from the target and null samples
        3. perform the p_greater test on each subclass
        4. count the number of subclasses which have at least one channel passing the test
    USAGE: num_subclass = Determine_number_of_subclass(Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset, test_confidence)

    :param Path_Target_Glitch_SubClassClustered_Dataset: a path to the clustered target samples
    :param Path_Null_Dataset: a path to null samples
    :param test_confidence: a statistical confidence level
    :return: num_subclass: the number of subclasses which have at least one channel passing the p_greater test
    '''
    # query the clustered target samples and null samples
    df_target, df_null = self.query_targetglitch_null(Path_Target_Glitch_SubClassClustered_Dataset, Path_Null_Dataset)
    # the beta distributions are estimated once and shared by all the subclasses
    rv_t_dict, rv_n_dict = self.perform_beta_dist(df_target, df_null)
    # non-duplicated subclass labels
    subclass_labels = df_target.SubclassLabel.unique().astype('int')
    num_subclass = 0
    for label in subclass_labels:
        # rows of this subclass only, without the label column
        rows_of_subclass = df_target.SubclassLabel == label
        df_target_sub = df_target[rows_of_subclass].drop(['SubclassLabel'], axis=1)
        # only the per-channel test results are needed out of the seven returned values
        _, _, _, _, _, test_results, _ = self.perform_p_greater(df_target_sub, rv_t_dict, rv_n_dict, test_confidence)
        passed_channels = test_results[test_results == 'pass']
        if passed_channels.shape[0] > 0:
            # at least one channel passes the test in this subclass
            num_subclass += 1
    return num_subclass
def Get_MatCountGPSDuration(self, Path_Target_Glitch_SubClassClustered_Dataset):
    '''
    description:
        get an importance matrix, a list of channels, a list of GPS times, and a list of durations from the clustered target samples
        the columns 'Unnamed: 0', 'SubclassLabel', 'GPS' and 'duration' are treated as metadata; every other column is treated as a channel
    USAGE: MatCount, ListChannelName, ListGPS, ListDuration = Get_MatCountGPSDuration(Path_Target_Glitch_SubClassClustered_Dataset)

    :param Path_Target_Glitch_SubClassClustered_Dataset: a path to the clustered target samples (.csv)
    :return:
        MatCount: an importance matrix with glitches in rows and channels in columns
        ListChannelName: a list of channels in numpy array
        ListGPS: a list of GPS times in numpy array
        ListDuration: a list of durations in numpy array; filled with NaN if the file has no 'duration' column
    :raises KeyError: if the file has no 'GPS' column
    '''
    df = pd.read_csv(Path_Target_Glitch_SubClassClustered_Dataset)
    # pandas raises KeyError (not ValueError) for a missing label, so the old try/except fallback never
    # triggered; errors='ignore' drops whichever metadata columns are actually present
    metadata_columns = ['Unnamed: 0', 'SubclassLabel', 'GPS', 'duration']
    df_channels = df.drop(metadata_columns, axis=1, errors='ignore')
    ListChannelName = np.array(df_channels.columns.tolist())
    ListGPS = df['GPS'].values
    if 'duration' in df.columns:
        ListDuration = df['duration'].values
    else:
        # files without durations are still accepted; the unknown durations are marked as NaN
        ListDuration = np.array([np.nan] * df.shape[0])
    MatCount = df_channels.values
    return MatCount, ListChannelName, ListGPS, ListDuration
def Principal_component_analysis(self, MatCount, frac_component=0.9):
    '''
    description:
        PCA decomposition applied to a feature matrix
        the smallest number of leading components whose cumulative explained variance ratio exceeds frac_component is kept
        if no cumulative value exceeds frac_component, all the components are kept
    USAGE: MatCount_pca, MatCount_inverse_pca = Principal_component_analysis(MatCount, frac_component=0.9)

    :param MatCount: a feature matrix of glitches with samples in rows and features in columns
    :param frac_component: a cumulative explained variance ratio to be exceeded
    :return:
        MatCount_pca: a feature matrix in PCA space
        MatCount_inverse_pca: a reconstructed feature matrix in the original space
    '''
    # first fit with all the components to get the explained variance ratio of every component
    full_pca = PCA()
    full_pca.fit(MatCount)
    cum_variance = np.cumsum(full_pca.explained_variance_ratio_)
    # 0-based positions of the components at which the cumulative variance exceeds the target
    above_target = np.where(cum_variance > frac_component)[0]
    if above_target.size > 0:
        # position k means that k + 1 components are needed; passing k itself would keep one
        # component too few and would ask for zero components when the first one is already enough
        n_components = int(above_target[0]) + 1
    else:
        # the target is never exceeded (e.g. frac_component >= 1), so keep everything
        n_components = cum_variance.shape[0]
    pca = PCA(n_components=n_components)
    MatCount_pca = pca.fit_transform(MatCount)
    MatCount_inverse_pca = pca.inverse_transform(MatCount_pca)
    return MatCount_pca, MatCount_inverse_pca
def FindSubClass(self, MatCount, ListChannelName, ListGPS, ListDuration, output_dir, upper_number_cluster, applied_Transformation):
    '''
    description:
        1. cluster the glitches with Gaussian mixture models, keeping the model with the lowest BIC
        2. plot glitch index VS channel grouped by clusters (GlitchIndexVSChannelClustered.pdf)
        3. save a table comprising sub-class labels, GPS times, durations and importance of all the channels (ClusteredGPSDurationImportanceChannels.csv)
        4. plot Importance VS channel of each sub-class (SubClass#.pdf)
        5. make a corresponding table (SubClass#.csv)
    USAGE: FindSubClass(MatCount, ListChannelName, ListGPS, ListDuration, output_dir, upper_number_cluster, applied_Transformation)

    :param MatCount: a matrix comprising importance versus channels, glitches in rows
    :param ListChannelName: a list of channel names
    :param ListGPS: a list of GPS times in numpy array
    :param ListDuration: a list of durations in numpy array
    :param output_dir: an output directory
    :param upper_number_cluster: a value of upper limit of the number of clusters
    :param applied_Transformation: decomposition applied before the clustering, 'PCA' or 'None' (None is accepted as well)
    :return: None
    :raises ValueError: if applied_Transformation is not supported
    '''
    # NOTE(review): this equality always holds for a 1-D MatCount (one single glitch), but it also holds
    # for any square 2-D matrix (as many glitches as channels) -- confirm that skipping those is intended
    if MatCount.shape[0] == MatCount.shape[-1]:  # if only a glitch, this plot is not useful
        print('There is only one single glitch. Skip finding sub-classes')
        return None
    # limit the upper_number_cluster as it can not be above the number of samples
    if MatCount.shape[0] < upper_number_cluster:
        upper_number_cluster = MatCount.shape[0]
    print('Finding sub-classes by Gaussian mixture clustering')
    #-------------------------------------------------+
    # use Gaussian mixture model to get sub-classes   |
    #-------------------------------------------------+
    # the clustering runs either in PCA space or on the raw matrix
    if applied_Transformation == 'PCA':
        X, _ = self.Principal_component_analysis(MatCount)
    elif applied_Transformation is None or applied_Transformation == 'None':
        X = MatCount
    else:
        # fail with a clear message instead of an unbound variable further down
        raise ValueError("applied_Transformation must be 'PCA' or 'None', got {!r}".format(applied_Transformation))
    # np.inf is used because the np.infty alias does not exist any more in NumPy 2.0
    lowest_bic = np.inf
    bic = []
    n_components_range = range(1, upper_number_cluster+1)
    cv_types = ['spherical', 'tied', 'diag', 'full']
    for cv_type in cv_types:
        for n_components in n_components_range:
            # Fit a Gaussian mixture with EM
            gmm = mixture.GaussianMixture(n_components=n_components, covariance_type=cv_type)
            gmm.fit(X)
            bic.append(gmm.bic(X))
            if bic[-1] < lowest_bic:
                lowest_bic = bic[-1]
                best_gmm = gmm
    clf = best_gmm
    Y_ = clf.predict(X)  # label of the clusters
    ListUniqueLable = np.unique(Y_)  # unique labels
    DictMatCount = {}  # a dictionary having the rows of MatCount of a given cluster label
    for UniqueLabel in ListUniqueLable:
        indecies = np.where(Y_ == UniqueLabel)[0]
        DictMatCount[str(UniqueLabel)] = MatCount[indecies]
    print('{} sub-classes have been found'.format(len(DictMatCount)))
    #----------------------------------------------------------+
    # plot of occurrence VS channel grouped by sub-classes     |
    #----------------------------------------------------------+
    # the background alternates between black and gray from one sub-system to the next
    Listcolor = ['black', 'gray']
    # get xticks for dominant channels
    CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannelName)
    fig = plt.figure()
    # the top strip showing the channel position takes about 10 % of the number of samples
    firstratio = int(0.1 * MatCount.shape[0])
    # if the number of samples is less than 10, firstratio becomes 0, so it is replaced with 1
    if firstratio == 0:
        firstratio = 1
    totalrow = MatCount.shape[0] + firstratio
    # make grid for subplots; the last of the 20 columns is kept for the color bar
    gs = GridSpec(totalrow, 20)
    # a subplot showing channel position
    ax0 = fig.add_subplot(gs[:firstratio, :-1])
    for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:]), 0):
        if edgeRight == MatCount.shape[-1] - 1:  # adjust the rightmost color span
            edgeRight = edgeRight + 1
        # modulo indexing keeps the alternation without limiting the number of sub-systems
        ax0.axvspan(edgeLeft-0.5, edgeRight-0.5, facecolor=Listcolor[color_ind % 2], alpha=0.5, edgecolor=None)
    # limit the xaxis of the first subplot
    ax0.axis(xmin=-0.5, xmax=round(MatCount.shape[-1] * 1.001) - 1 + 0.5)
    # get rid of ticks of the first plot, then label the dominant sub-systems only
    ax0.yaxis.set_major_locator(plt.NullLocator())
    ax0.xaxis.set_major_locator(plt.NullLocator())
    ax0.set_xticks(CenterTicks)
    ax0.set_xticklabels(ListSubsys[ListInd.astype(int)])
    ax0.tick_params(axis="x", labelbottom=False, labeltop=True, direction='in', pad=-20, length=0, labelrotation=90, size=0, labelsize=10)
    ax0.grid(False)
    StartRow = firstratio  # first row number of a sub-plot
    for SubClassLabel, SubMatCount in sorted(DictMatCount.items()):  # iterate through sub-classes
        NumRow = SubMatCount.shape[0]  # the number of rows for this sub-class
        EndRow = StartRow + NumRow  # the ending row number
        # each sub-class gets as many grid rows as it has glitches
        ax = fig.add_subplot(gs[StartRow:EndRow, :-1])
        # the color scale is shared by all the sub-classes
        ax.imshow(SubMatCount, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, origin='upper', vmin=MatCount.min(), vmax=MatCount.max())
        ax.yaxis.set_label_position("right")
        # adjust y-ticks size
        ax.tick_params(axis="y", labelsize=10)
        ax.axis(xmin=-0.5, xmax=round(SubMatCount.shape[-1] * 1.001)-1+0.5)
        # sub-class name on the right side of the panel
        ax.text(SubMatCount.shape[-1]*1.001, SubMatCount.shape[0]/2., 'sub{}'.format(SubClassLabel), ha='left', va='bottom', rotation=90., fontsize=10)
        if SubMatCount.shape[0] == 1:
            ax.set_yticks([0])
        else:
            ax.yaxis.set_major_locator(MaxNLocator(integer=True))
        ax.set_xticks([])  # delete ticks
        # get rid of grid
        ax.grid(False)
        StartRow = EndRow  # update for a next one
    fig.subplots_adjust(hspace=int(0.005*totalrow))
    fig.subplots_adjust(wspace=int(3))
    # the x label goes on the last (bottom) sub-class panel
    ax.set_xlabel('Auxiliary channel', fontsize=15)
    fig.text(0.05, 0.5, 'Glitch index', ha='center', va='center', rotation=90., fontsize=15)
    ###### Make a color bar
    cmap = matplotlib.cm.viridis
    norm = matplotlib.colors.Normalize(vmin=MatCount.min(), vmax=MatCount.max())
    ax3 = fig.add_subplot(gs[1:, -1])
    cbar = matplotlib.colorbar.ColorbarBase(ax3, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
    ax3.grid(False)
    cbar.set_label('Importance', size=10)
    ########
    output_path = os.path.join(output_dir, 'GlitchIndexVSChannelClustered.pdf')
    fig.savefig(output_path)
    plt.close()
    #-------------------------------------------------------------------------------+
    # make .csv file comprising SubClassLabel, GPS, importance of all the channels  |
    #-------------------------------------------------------------------------------+
    # matrix of sub-class labels and the corresponding GPS times and durations
    MatSubClassLabelGPSDuration = np.concatenate((Y_.reshape(Y_.shape[0], 1), ListGPS.reshape(ListGPS.shape[0], 1), ListDuration.reshape(ListDuration.shape[0], 1)), axis=1)
    # append the importance of all the channels
    MatSubClassLabelGPSDurationImportanceChannels = np.concatenate((MatSubClassLabelGPSDuration, MatCount), axis=1)
    # make a list of column names
    columnName = np.append(np.array(['SubclassLabel', 'GPS', 'duration']), ListChannelName)
    # convert it to a pandas frame
    dfGPSDurationImportanceChannels = pd.DataFrame(MatSubClassLabelGPSDurationImportanceChannels, columns=columnName)
    path_output_fileGPSImportanceChannels = os.path.join(output_dir, 'ClusteredGPSDurationImportanceChannels.csv')
    # save it as .csv file
    dfGPSDurationImportanceChannels.to_csv(path_output_fileGPSImportanceChannels)
    print('Completed saving a table comprising a sub-class labels, GPS times, and importance of the channels')
    #--------------------------------------------+
    # plot each sub-class and make a table of it |
    #--------------------------------------------+
    for SubClassLabel, SubMatCount in sorted(DictMatCount.items()):  # iterate through sub-classes
        Plotoutput_file = 'SubClass{}.pdf'.format(SubClassLabel)
        Tableoutput_file = 'SubClass{}.csv'.format(SubClassLabel)
        # plot
        self.PlotImportanceVSChannel(SubMatCount, ListChannelName, output_dir, Plotoutput_file)
        # making a table
        self.TableImportance(SubMatCount, ListChannelName, output_dir, Tableoutput_file)
        print('Completed plotting and making a table of sub-class {}'.format(SubClassLabel))
    print('Sub-class analysis is done')
    return None
def AutoDetermineTrendBin(self, SegmentStart, SegmentEnd):
    '''
    description: automatically determine the bins of the subclass trend plot from the segment length
        shorter than 3 hours -> 'mins', shorter than 3 days -> 'hours',
        shorter than 3 months (of 30 days) -> 'days', otherwise 'months'
    USAGE: trend = AutoDetermineTrendBin(SegmentStart, SegmentEnd)

    :param SegmentStart: start time of a segment in seconds
    :param SegmentEnd: end time of a segment in seconds
    :return: trend:
        {'mins', 'hours', 'days', 'months'}
    '''
    hour = 60 * 60  # in second
    day = 24 * hour
    month = 30 * day
    # segment length in second
    length = SegmentEnd - SegmentStart
    # candidates from the finest to the coarsest, each with the length below which it is chosen
    upper_limits = (('mins', 3 * hour), ('hours', 3 * day), ('days', 3 * month))
    for candidate, upper_limit in upper_limits:
        if length < upper_limit:
            trend = candidate
            break
    else:
        # none of the finer bins matched
        if length >= 3 * month:
            trend = 'months'
    return trend
def TrendSubClass(self, SegmentStart, SegmentEnd, input_dir, input_file, output_dir, trend='months', norm=False):
    '''
    description:
        plot the rate of each sub-class as a function of time from the beginning of a segment
        the input file needs the columns 'GPS' and 'SubclassLabel'
        (presumably the sub-class table written by FindSubClass, ClusteredGPSDurationImportanceChannels.csv -- confirm against the caller)
        if the input file can not be read, the plot is skipped
    USAGE: TrendSubClass(SegmentStart, SegmentEnd, input_dir, input_file, output_dir, trend='months')

    :param SegmentStart: start GPS time of a segment
    :param SegmentEnd: end GPS time of a segment
    :param input_dir: input directory
    :param input_file: an input file
    :param output_dir: an output directory
    :param trend: a trend 'months', 'days', 'hours' or 'mins' (a month is taken as 30 days)
    :param norm: if true, the histogram of each sub-class is normalized (density) before it is divided by the bin width
    :return: None
    :raises ValueError: if trend is not one of the supported values
    '''
    # load an input file
    input_path = os.path.join(input_dir, input_file)
    try:
        Subclass_df = pd.read_csv(input_path)
    except IOError:
        print('There is no sub-class file. Skip the trend plot')
        return None
    # length of a trend unit in second and the name of the output file for each supported trend
    trend_settings = {
        'months': (float(30 * 24 * 60 * 60), 'SubClassTrend_Monthly.pdf'),
        'days': (float(24 * 60 * 60), 'SubClassTrend_Dayly.pdf'),
        'hours': (float(60 * 60), 'SubClassTrend_Hourly.pdf'),
        'mins': (float(60), 'SubClassTrend_minutes.pdf'),
    }
    if trend not in trend_settings:
        # fail with a clear message instead of an unbound variable further down
        raise ValueError("trend must be one of 'months', 'days', 'hours' or 'mins', got {!r}".format(trend))
    TrendInSecond, output_file = trend_settings[trend]
    # add a column having time from the beginning of a segment in a trend scale
    Subclass_df = Subclass_df.assign(time=(Subclass_df['GPS'] - SegmentStart) / TrendInSecond)
    # a length of this segment in a trend scale
    SegmentLength = (SegmentEnd - SegmentStart) / TrendInSecond
    # one bin per trend unit; at least one bin so that a segment shorter than half a unit still works
    Bins = max(1, int(round(SegmentLength)))
    # non-duplicate sub-class labels
    UniqueClass = np.unique(Subclass_df['SubclassLabel']).astype(int)
    # make a list of each sub-class trend
    SubClassSeries = [Subclass_df[Subclass_df['SubclassLabel'] == subclass]['time'] for subclass in UniqueClass]
    LABEL = ['sub{}'.format(i) for i in UniqueClass]
    # making combinations of colors and linestyles, in a random order
    ListColor = ['blue', 'red', 'cyan', 'gray', 'green', 'yellow']
    ListLineStyle = ['-']
    ColorStyleCombination = list(itrtls.product(ListColor, ListLineStyle))
    rd.shuffle(ColorStyleCombination)
    # any falsy norm gives the un-normalized rate
    normalized = bool(norm)
    if normalized:
        y_label = 'Normalized rate [/{}]'.format(trend)
    else:
        # singular unit name, e.g. 'months' -> 'month'
        y_label = 'Rate [/{}]'.format(trend.split('s')[0])
    # make a histogram for each sub-class
    for index, (series, label) in enumerate(zip(SubClassSeries, LABEL)):
        Count, edge = np.histogram(series, range=(0, SegmentLength), bins=Bins, density=normalized)
        d_edge = edge[1] - edge[0]
        # NOTE(review): with norm=True the density is already per unit of time and is divided by the bin width once more here -- confirm that this is intended
        Rate = Count/float(d_edge)
        # the combinations are re-used cyclically so that more sub-classes than combinations do not fail
        color, linestyle = ColorStyleCombination[index % len(ColorStyleCombination)]
        plt.plot(edge[:-1], Rate, marker='o', color=color, ls=linestyle, alpha=0.5, label=label)
        plt.ylabel(y_label, fontsize=15)
    TIME = tconvert(SegmentStart)
    plt.xlabel('Time [{0}] from {1}-{2:02d}-{3:02d} {4:02d}:{5:02d}:{6:02d} UTC'.format(trend, TIME.year, TIME.month, TIME.day, TIME.hour, TIME.minute, TIME.second), fontsize=15)
    plt.tick_params(axis="x", labelsize=10)
    plt.tick_params(axis="y", labelsize=10)
    plt.legend()
    output_path = os.path.join(output_dir, output_file)
    plt.savefig(output_path)
    plt.close()
    return None
def make_subset_channel_based_on_samplingrate(self, g, target_sampling_rate):
    '''
    description:
        Collect the target ('T') data sets of one glitch group that have a given sampling rate
        and stack them into a matrix with one channel per row.
        Data sets whose channel name has 'NOISEMON' as the third '_'-separated token after the
        first '-' are skipped. A data set whose first sample is NaN is replaced by zeros.
    dependencies being upon: PlotIndividualFCS_ImportanceVSChannel()
    USAGE: X, duration, Listch_label_num, ListChannel, GPS, SNR, confidence, ID = make_subset_channel_based_on_samplingrate(f['gps00000'], 256)

    :param g: target glitch class's group or a file itself (at a GPS time), HDF5 format
    :param target_sampling_rate: sampling rate used to select the channels (256, 512, 1024, 2048, 4096, 8192, 16384)
    :return X: matrix with one row per selected channel; an empty 1-D array when no channel matches
            duration: 'duration' attribute of the last target data set visited, None if there is no target data set
            Listch_label_num: the list of channel label numbers (the integer following 'chT' in the data set name)
            ListChannel: the list of name of channels
            GPS: GPS time of this group
            SNR: SNR of this glitch (0 if any of the 'snr', 'confidence', 'id' attributes is missing)
            confidence: confidence level of this glitch (0 if any of those attributes is missing)
            ID: GravitySpy unique ID ('None' if any of those attributes is missing,
                None if the stored id is neither a byte string nor a string)
    '''
    def _as_text(raw):
        # string attributes may come back either as byte strings or as str depending on
        # the h5py version / how they were written; accept both instead of failing on str
        try:
            return raw.decode('utf-8')
        except AttributeError:
            return raw if isinstance(raw, str) else None

    GPS = g.attrs['GPS']  # GPS time
    try:
        SNR = g.attrs['snr']  # SNR
        confidence = g.attrs['confidence']  # confidence level
        ID = _as_text(g.attrs['id'])  # GravitySpy id
    except KeyError:
        # any missing attribute resets all three to their placeholders
        SNR = 0
        confidence = 0
        ID = 'None'
    Listch_label_num = []  # a list having the channel labels
    ListChannel = []  # a list having the name of channels
    rows = []  # one (1, n_samples) array per selected channel, stacked once at the end
    duration = None  # stays None when the group holds no target data set
    typeSeg = 'T'
    for ch_label in g.keys():
        if str(ch_label)[2] != typeSeg:  # take only target data sets
            continue
        dset = g[ch_label]
        sr = dset.attrs['SamplingRate']
        duration = dset.attrs['duration']
        channel = _as_text(dset.attrs['channel'])
        ########## remove noisemon channels #############
        try:
            if channel.split('-')[1].split('_')[2] == 'NOISEMON':
                continue
        except IndexError:
            # the channel name does not have enough tokens to be a NOISEMON channel
            pass
        ######################
        if sr != target_sampling_rate:
            continue
        spT = dset[()]  # whitened FFT values (Dataset.value is no longer available in h5py 3)
        if np.isnan(spT[0]):
            spT = np.zeros(spT.shape[0])
        rows.append(spT.reshape((1, spT.shape[0])))  # reshape the vector suitable to make a matrix
        Listch_label_num.append(int(ch_label.split('ch{}'.format(typeSeg))[1]))
        ListChannel.append(channel)
    X = np.vstack(rows) if rows else np.array([])
    return X, duration, Listch_label_num, ListChannel, GPS, SNR, confidence, ID
def PlotIndividualFCS_ImportanceVSChannel(self, glitchtype, IFO, GravitySpy_df, output_dir, sigma, LowerCutOffFreq='None', UpperCutOffFreq='None', mode='offline', Listsegments=None, re_sfchs=None, Data_outputpath=None, Data_outputfilename=None, PlusHOFT='False', number_process=None):
    '''
    description:
        1. load a file comprising all glitches in a class
        2. create a plot comprising frequency versus channel & importance versus channel
            - dependency:
                self.CreateChannelTicks(ListChannel)
                self.make_subset_channel_based_on_samplingrate()
                self.CreateMatCount()
                self.PlotImportanceVSChannel()
        3. save a plot (one PDF per glitch, named after its GPS time)
        In 'offline' mode the groups of self.f are plotted (groups without any data set are skipped);
        in 'online' mode the groups yielded by SaveTargetAndBackGroundHDF5_ONLINE() are plotted.
        Any other mode does nothing.
    dependencies: make_subset_channel_based_on_samplingrate(), CreateChannelTicks(), CreateMatCount(), PlotImportanceVSChannel()
    USAGE: PlotIndividualFCS_ImportanceVSChannel(glitchtype, IFO, GravitySpy_df, output_dir, sigma, LowerCutOffFreq, UpperCutOffFreq)

    :param glitchtype: a type of glitch used for create a name of a plot
    :param IFO: a type of IFO used in a name of a plot
    :param GravitySpy_df: a meta data of Gravity Spy in pandas frame
    :param output_dir: a output directory
    :param sigma: an integer number used for the upper bound of BG noise
    :param LowerCutOffFreq: the lower cut-off frequency used in CreateMatCount, 'None' in default
    :param UpperCutOffFreq: the upper cut-off frequency used in CreateMatCount, 'None' in default
    :param mode: 'offline' or 'online'
    :param Listsegments: a list of allowed glitches, which is used for online mode only, None in default
    :param re_sfchs: a list of safe channels except unused channels, which is used for online mode only, None in default
    :param Data_outputpath: a directory saving for a HDF5 file, which is used for online mode only, None in default
    :param Data_outputfilename: a file saving for a HDF5 file, which is used for online mode only, None in default
    :param PlusHOFT: whether to get data of HOFT {'True', 'False'}, which is used for online mode only, 'False' in default
    :param number_process: a number of processes in parallel, which is used for online mode only, None in default
    return None
    '''
    if mode == 'offline':
        for group_label in list(self.f.keys()):  # iterate though all glitches in a class
            g = self.f[group_label]
            if len(g.keys()) == 0:  # if no dataset in this group, go to a next iteration
                continue
            # NOTE(review): the original also had `if len(g.keys()) / 3 == int: continue`
            # ("if there are missing channels, go to a next iteration"). Comparing a number with
            # the type `int` is never true, so that check never skipped anything and was dropped.
            # If incomplete groups must be skipped, the intended condition has to be confirmed
            # (possibly `len(g.keys()) % 3 != 0`).
            self._plot_group_fcs_importance(g, glitchtype, IFO, GravitySpy_df, output_dir, sigma, LowerCutOffFreq, UpperCutOffFreq, 'Completed plotting for a(n) {} glitch in {} at {}')
    elif mode == 'online':
        for g in SaveTargetAndBackGroundHDF5_ONLINE(Listsegments, re_sfchs, IFO, Data_outputpath, Data_outputfilename, number_process, PlusHOFT):  # iterate group by group
            self._plot_group_fcs_importance(g, glitchtype, IFO, GravitySpy_df, output_dir, sigma, LowerCutOffFreq, UpperCutOffFreq, 'Completed saving a plot for a(n) {} glitch in {} at {}')
    return None

def _target_color_limits(self, g):
    '''
    description:
        find the colour-scale limits shared by all panels of one glitch plot
    :param g: a group holding the data sets of one glitch
    :return: (SmallestValue, LargestValue) over the target ('T') data sets; the search starts
             from the dummy values 1 and 0, so the lower limit is never above 1 and the upper
             limit is never below 0
    '''
    SmallestValue = 1  # dummy global minimum
    LargestValue = 0  # dummy global maximum
    typeSeg = 'T'  # only for target data sets
    for ch_label in g.keys():  # iterate through channel data sets
        if str(ch_label)[2] != typeSeg:  # take only target data sets
            continue
        values = g[ch_label][()]  # read the data set once (Dataset.value is gone in h5py 3)
        localMin = values.min()
        localMax = values.max()
        if localMin <= SmallestValue:
            SmallestValue = localMin
        if LargestValue <= localMax:
            LargestValue = localMax
    return SmallestValue, LargestValue

def _plot_group_fcs_importance(self, g, glitchtype, IFO, GravitySpy_df, output_dir, sigma, LowerCutOffFreq, UpperCutOffFreq, done_message):
    '''
    description:
        make and save the figure of a single glitch: the upper half has one frequency-versus-channel
        panel per sampling rate plus a colour bar, the lower half has importance versus channel
    :param g: a group holding the data sets of one glitch
    :param glitchtype, IFO, GravitySpy_df, output_dir, sigma, LowerCutOffFreq, UpperCutOffFreq:
           same meaning as in PlotIndividualFCS_ImportanceVSChannel()
    :param done_message: a format string taking (glitchtype, IFO, GPS) printed once the figure is saved
    return None
    '''
    #++++++++++ determine color upper and lower values +++++++++++++++++
    SmallestValue, LargestValue = self._target_color_limits(g)
    # set figure size
    fig = plt.figure(figsize=(25, 20))
    # set outer grid of this plot
    Base_outer = GridSpec(2, 50, wspace=0.3, hspace=0.5)
    outer = GridSpecFromSubplotSpec(1, 7, subplot_spec=Base_outer[:49], wspace=0.3, hspace=0)
    # color the background either with black or gray for each channel group
    Listcolor = ['black', 'gray'] * 20 * 7
    #++++++++++++ plot frequency vs channels grouped based on their sampling rates ++++++++++++++++++++
    for fig_num, target_samplerate in enumerate([256, 512, 1024, 2048, 4096, 8192, 16384]):  # iterate though all the sampling rates
        # make matrix having channels in row, frequency in column, and list of extra parameters
        X, duration, Listch_label_num, ListChannel, GPS, SNR, confidence, ID = self.make_subset_channel_based_on_samplingrate(g, target_samplerate)
        # make grid of an inner plot
        inner = GridSpecFromSubplotSpec(15, 1, subplot_spec=outer[fig_num], wspace=0.1, hspace=0)
        if X.shape[0] == 0:
            # no channel at this sampling rate: leave an empty panel so the layout stays the same
            # (the online mode used to fail on X.T[::-1, :] here)
            ax2 = fig.add_subplot(inner[:, 0])
            ax2.set_xticks([])
            ax2.set_yticks([])
            ax2.grid(False)
        else:
            # transpose X (X.T) make this matrix having frequency in row, channel in column, but the frequency increases from the top to the bottom
            X = X.T[::-1, :]  # turn up side down by [::-1,:], to make the frequency values go from the bottom to the top
            #+++++++++++ plot color indicator for channels on the top +++++++++++++++++++++++++++++
            ax1 = fig.add_subplot(inner[:1, 0])  # first row, first column
            CenterTicks, ListInd, ListSubsys = self.CreateChannelTicks(ListChannel)
            last_span = ListInd.shape[-1] - 2
            for color_ind, (edgeLeft, edgeRight) in enumerate(zip(ListInd[:-1], ListInd[1:])):
                if color_ind == last_span:  # add a cell for at the end to make a plot nicer
                    edgeRight = edgeRight + 1
                ax1.axvspan(edgeLeft, edgeRight, facecolor=Listcolor[color_ind], alpha=0.5, edgecolor=None)
            ax1.axis(xmin=0, xmax=X.shape[-1])
            ax1.set_xticks([])  # make ticks disappear
            ax1.set_yticks([])  # make ticks disappear
            #++++++++++ make a frequency versus channel plot +++++++++++++++++++
            ax2 = fig.add_subplot(inner[1:, 0])  # from the second row, first column
            ax2.imshow(X, aspect='auto', interpolation='nearest', cmap=plt.cm.viridis, vmin=SmallestValue, vmax=LargestValue, origin='upper')
            # make frequency ticks to be correct
            cutoff_freq_left = 0
            cutoff_freq_right = target_samplerate / 2
            interval = 5
            y_tics = np.linspace(cutoff_freq_left, cutoff_freq_right, interval, endpoint=True)[::-1].astype(int)  # new ticks
            ax2.set_yticks(np.linspace(0, X.shape[0] - 1, interval, endpoint=True))  # set y ticks locations
            ax2.set_yticklabels(y_tics, size=15)  # replace y ticks with a frequency values
            ax2.set_xticks(CenterTicks)  # locate x ticks
            ax2.set_xticklabels(ListSubsys[ListInd.astype(int)])  # replace x ticks with channel names
            ax2.xaxis.set_tick_params(labelbottom=False, labeltop=True, direction='in', pad=3, length=0, labelrotation=90, size=0, labelsize=10)  # set x ticks parameters
            ax2.grid(False)
        # write y label only for the left panel as the rest are duplicated
        if fig_num == 0:
            ax2.set_ylabel('Frequency [Hz]', fontsize=20)
    ###### Make a color bar
    inner2 = GridSpecFromSubplotSpec(1, 1, subplot_spec=Base_outer[49], wspace=0.01, hspace=0.5)
    cmap = matplotlib.cm.viridis
    norm = matplotlib.colors.Normalize(vmin=SmallestValue, vmax=LargestValue)
    ax_bar = fig.add_subplot(inner2[0])
    cbar = matplotlib.colorbar.ColorbarBase(ax_bar, cmap=cmap, norm=norm, orientation='vertical', drawedges=False)
    ax_bar.grid(False)
    cbar.set_label('Whitened FFT', size=10)
    #### make a title ######
    # get a URL of Q-transform of h(t)
    try:
        URL = GravitySpy_df[GravitySpy_df['id'] == ID]['imgUrl'].tolist()[0]
    except IndexError:  # deal with the one with no Q-transform plot of h(t)
        URL = 'None'
    # the band part of the title depends on which cut-off frequencies are given
    if LowerCutOffFreq == 'None' and UpperCutOffFreq == 'None':
        band = ''
    elif UpperCutOffFreq == 'None':
        band = ', band = {}- Hz '.format(int(LowerCutOffFreq))
    elif LowerCutOffFreq == 'None':
        band = ', band = -{} Hz '.format(int(UpperCutOffFreq))
    else:
        band = ', band = {}-{} Hz '.format(int(LowerCutOffFreq), int(UpperCutOffFreq))
    # give an overall title
    fig.suptitle('{}, SNR = {:.02f}, confidence = {:.02f}, \n GPS = {:.02f}, ID = {}, duration = {} s{}\n Q-transform of h(t): {}'.format(glitchtype, SNR, confidence, GPS, ID, duration, band, URL), fontsize=20)
    # x label for the above plot
    fig.text(0.5, 0.55, 'Auxiliary channel', ha='center', va='center', fontsize=20)
    #++++++++ plot importance versus channel
    inner3 = GridSpecFromSubplotSpec(1, 1, subplot_spec=Base_outer[50:], wspace=0.1, hspace=0.5)
    ax = fig.add_subplot(inner3[0])  # the whole lower half of the figure
    # create a count matrix of this glitch
    MatCount, ListChannelName, ListSNR, ListConf = self.CreateMatCount(sigma, g, LowerCutOffFreq, UpperCutOffFreq)
    # actually plot it
    self.PlotImportanceVSChannel(MatCount, ListChannelName, None, None, ax=ax)
    output_file = '{}.pdf'.format(str(GPS).replace('.', '-'))
    output_path = os.path.join(output_dir, output_file)
    fig.savefig(output_path)
    plt.close(fig)  # close this figure explicitly rather than whichever one is current
    print(done_message.format(glitchtype, IFO, str(GPS)))
    return None
def FrequencyBandColor(self, PathGlitchChannel_Low, PathGlitchChannel_Mid, PathGlitchChannel_High):
    '''
    description:
        1. load files that have a low, middle, high frequency band importances. (GPSImportanceChannels.csv)
        2. make a matrix that has RGB color based on each importance
           (low frequency band is red, middle is green, high is blue)
        3. output this matrix
        The 'Unnamed: 0' and 'GPS' columns of each file are dropped before use.
        If a file cannot be opened, a message is printed and the program exits through sys.exit().
    USAGE: RGBMat = FrequencyBandColor(PathGlitchChannel_Low, PathGlitchChannel_Mid, PathGlitchChannel_High)

    :param PathGlitchChannel_Low: a path to a file that has a low frequency band
    :param PathGlitchChannel_Mid: a path to a file that has a middle frequency band
    :param PathGlitchChannel_High: a path to a file that has a high frequency band
    :return: RGBMat, an array of shape (rows, columns, 3) where rows and columns are taken from
             the low frequency band table; extra rows/columns of the other two tables are ignored
    :raises IndexError: if the middle or high frequency band table is smaller than the low one
    '''
    def _load_importance(path):
        # read one band file and return its importance values as a 2-D array
        try:
            frame = pd.read_csv(path)
            return frame.drop(['Unnamed: 0', 'GPS'], axis=1).values
        except IOError:
            print('There is not such a file in {}'.format(path))
            sys.exit()

    ### load data files (in the order low, mid, high)
    low = _load_importance(PathGlitchChannel_Low)
    mid = _load_importance(PathGlitchChannel_Mid)
    high = _load_importance(PathGlitchChannel_High)
    n_rows, n_cols = low.shape
    # make a matrix with all the elements equal to zero
    RGBMat = np.zeros((n_rows, n_cols, 3))
    if n_rows and n_cols:
        # fill one colour plane at a time instead of element by element
        for plane, (band_name, band) in enumerate((('low', low), ('mid', mid), ('high', high))):
            if band.shape[0] < n_rows or band.shape[1] < n_cols:
                raise IndexError('the {} frequency band table has shape {}, smaller than the low frequency band table {}'.format(band_name, band.shape, low.shape))
            RGBMat[:, :, plane] = band[:n_rows, :n_cols]
    return RGBMat