293#include "TRestDataSet.h"
296#include "TRestTools.h"
339 if (
fTree !=
nullptr) {
340 RESTWarning <<
"Tree has already been loaded. Skipping TRestDataSet::GenerateDataSet ... "
349 RESTError <<
"File selection is empty " <<
RESTendl;
355 std::vector<std::string> finalList;
356 finalList.push_back(
"runOrigin");
357 finalList.push_back(
"eventID");
358 finalList.push_back(
"timeStamp");
362 if (std::find(obsNames.begin(), obsNames.end(), obs) != obsNames.end()) {
363 finalList.push_back(obs);
365 RESTWarning <<
" Observable " << obs <<
" not found in observable list, skipping..." <<
RESTendl;
369 for (
const auto& name : obsNames) {
371 if (name.find(pcs) == 0) finalList.push_back(name);
376 std::sort(finalList.begin(), finalList.end());
377 finalList.erase(std::unique(finalList.begin(), finalList.end()), finalList.end());
380 ROOT::EnableImplicitMT();
382 ROOT::DisableImplicitMT();
384 RESTInfo <<
"Initializing dataset" <<
RESTendl;
387 RESTInfo <<
"Making cuts" <<
RESTendl;
392 RESTInfo <<
"Adding column to dataset: " << cName <<
RESTendl;
393 finalList.emplace_back(cName);
397 RESTInfo <<
"Generating snapshot." <<
RESTendl;
398 std::string user = getenv(
"USER");
399 std::string fOutName =
"/tmp/rest_output_" + user +
".root";
400 fDataSet.Snapshot(
"AnalysisTree", fOutName, finalList);
402 RESTInfo <<
"Re-importing analysis tree." <<
RESTendl;
403 fDataSet = ROOT::RDataFrame(
"AnalysisTree", fOutName);
405 TFile* f = TFile::Open(fOutName.c_str());
406 fTree = (TChain*)f->Get(
"AnalysisTree");
408 RESTInfo <<
" - Dataset generated!" <<
RESTendl;
420 if (!time_stamp_end || !time_stamp_start) {
421 RESTError <<
"TRestDataSet::FileSelect. Start or end dates not properly formed. Please, check "
422 "REST_StringHelper::StringToTimeStamp documentation for valid formats"
429 RESTInfo <<
"TRestDataSet::FileSelection. Starting file selection." <<
RESTendl;
430 RESTInfo <<
"Total files : " << fileNames.size() <<
RESTendl;
431 RESTInfo <<
"This process may take long computation time in case there are many files." <<
RESTendl;
434 std::cout <<
"Processing file selection.";
436 for (
const auto& file : fileNames) {
437 if (cnt % 100 == 0) {
438 std::cout << std::endl;
439 std::cout <<
"Files processed: " << cnt <<
" ." << std::flush;
443 std::cout <<
"." << std::flush;
444 double runStart = run.GetStartTimestamp();
445 double runEnd = run.GetEndTimestamp();
447 if (runStart < time_stamp_start || runEnd > time_stamp_end) {
448 RESTInfo <<
"Rejecting file out of date range: " << file <<
RESTendl;
455 std::string mdValue = run.GetMetadataMember(md);
458 if (mdValue.find(
fFilterContains[n]) == std::string::npos) accept =
false;
473 if (!accept)
continue;
476 for (
auto& [name, properties] :
fQuantity) {
480 if (properties.strategy ==
"accumulate") {
482 properties.value = StringWithPrecision(val, 2);
485 if (properties.strategy ==
"max")
487 properties.value = value;
489 if (properties.strategy ==
"min")
491 properties.value = value;
493 if (properties.strategy ==
"unique") {
494 if (properties.value.empty())
495 properties.value = value;
496 else if (properties.value != value) {
497 RESTWarning <<
"TRestDataSet::FileSelection. Relevant quantity retrieval." <<
RESTendl;
498 RESTWarning <<
"A unique metadata member used for the `" << name
499 <<
"` quantity is not unique!!" <<
RESTendl;
500 RESTWarning <<
"Pre-registered value : " << properties.value <<
" New value : " << value
505 if (properties.strategy ==
"last") properties.value = value;
512 fTotalDuration += run.GetEndTimestamp() - run.GetStartTimestamp();
515 std::cout << std::endl;
529 if (cut ==
nullptr)
return df;
531 auto paramCut = cut->GetParamCut();
532 auto obsList = df.GetColumnNames();
533 for (
const auto& [param, condition] : paramCut) {
534 if (std::find(obsList.begin(), obsList.end(), param) != obsList.end()) {
535 std::string pCut = param + condition;
536 RESTDebug <<
"Applying cut " << pCut <<
RESTendl;
537 df = df.Filter(pCut);
539 RESTWarning <<
" Cut observable " << param <<
" not found in observable list, skipping..."
544 auto cutString = cut->GetCutStrings();
545 for (
const auto& pCut : cutString) {
547 for (
const auto& obs : obsList) {
548 if (pCut.find(obs) != std::string::npos) {
549 RESTDebug <<
"Applying cut " << pCut <<
RESTendl;
550 df = df.Filter(pCut);
557 RESTWarning <<
" Cut string " << pCut <<
" not found in observable list, skipping..." <<
RESTendl;
579 std::string evalFormula = formula;
580 for (
auto const& [name, properties] :
fQuantity)
583 df = df.Define(columnName, evalFormula);
606 RESTMetadata <<
" Single observables added:" <<
RESTendl;
607 RESTMetadata <<
" -------------------------" <<
RESTendl;
614 RESTMetadata <<
" Process observables added: " <<
RESTendl;
615 RESTMetadata <<
" -------------------------- " <<
RESTendl;
622 RESTMetadata <<
" Metadata filters: " <<
RESTendl;
623 RESTMetadata <<
" ----------------- " <<
RESTendl;
628 RESTMetadata <<
" - " << mdFilter <<
".";
643 RESTMetadata <<
" Relevant quantities: " <<
RESTendl;
644 RESTMetadata <<
" -------------------- " <<
RESTendl;
646 for (
auto const& [name, properties] :
fQuantity) {
647 RESTMetadata <<
" - Name : " << name <<
". Value : " << properties.value
648 <<
". Strategy: " << properties.strategy <<
RESTendl;
649 RESTMetadata <<
" - Metadata: " << properties.metadata <<
RESTendl;
650 RESTMetadata <<
" - Description: " << properties.description <<
RESTendl;
656 RESTMetadata <<
" New columns added to generated dataframe: " <<
RESTendl;
657 RESTMetadata <<
" ---------------------------------------- " <<
RESTendl;
659 RESTMetadata <<
" - Name : " << cName <<
RESTendl;
660 RESTMetadata <<
" - Expression: " << cExpression <<
RESTendl;
667 RESTMetadata <<
"This is a combined dataset." <<
RESTendl;
668 RESTMetadata <<
" -------------------- " <<
RESTendl;
669 RESTMetadata <<
" - Relevant quantities have been removed!" <<
RESTendl;
670 RESTMetadata <<
" - Dataset metadata properties correspond to the first file in the list."
673 RESTMetadata <<
"List of imported files: " <<
RESTendl;
674 RESTMetadata <<
" -------------------- " <<
RESTendl;
680 RESTMetadata <<
" - Multithreading was enabled" <<
RESTendl;
682 RESTMetadata <<
" - Multithreading was NOT enabled" <<
RESTendl;
694 TiXmlElement* filterDefinition =
GetElement(
"filter");
695 while (filterDefinition !=
nullptr) {
696 std::string metadata =
GetFieldValue(
"metadata", filterDefinition);
697 if (metadata.empty() || metadata ==
"Not defined") {
698 RESTError <<
"Filter key defined without metadata member!" <<
RESTendl;
704 std::string contains =
GetFieldValue(
"contains", filterDefinition);
705 if (contains ==
"Not defined") contains =
"";
719 TiXmlElement* observablesDefinition =
GetElement(
"observables");
720 while (observablesDefinition !=
nullptr) {
721 std::string observables =
GetFieldValue(
"list", observablesDefinition);
722 if (observables.empty() || observables ==
"Not defined") {
723 RESTError <<
"<observables key does not contain a list!" <<
RESTendl;
735 TiXmlElement* obsProcessDefinition =
GetElement(
"processObservables");
736 while (obsProcessDefinition !=
nullptr) {
737 std::string observables =
GetFieldValue(
"list", obsProcessDefinition);
738 if (observables.empty() || observables ==
"Not defined") {
739 RESTError <<
"<processObservables key does not contain a list!" <<
RESTendl;
751 TiXmlElement* quantityDefinition =
GetElement(
"quantity");
752 while (quantityDefinition !=
nullptr) {
753 std::string name =
GetFieldValue(
"name", quantityDefinition);
754 if (name.empty() || name ==
"Not defined") {
755 RESTError <<
"<quantity key does not contain a name!" <<
RESTendl;
759 std::string metadata =
GetFieldValue(
"metadata", quantityDefinition);
760 if (metadata.empty() || metadata ==
"Not defined") {
761 RESTError <<
"<quantity key does not contain a metadata value!" <<
RESTendl;
765 std::string strategy =
GetFieldValue(
"strategy", quantityDefinition);
766 if (strategy.empty() || strategy ==
"Not defined") {
770 std::string description =
GetFieldValue(
"description", quantityDefinition);
784 TiXmlElement* columnDefinition =
GetElement(
"addColumn");
785 while (columnDefinition !=
nullptr) {
787 if (name.empty() || name ==
"Not defined") {
788 RESTError <<
"<define key does not contain a name name!" <<
RESTendl;
792 std::string expression =
GetFieldValue(
"expression", columnDefinition);
793 if (expression.empty() || expression ==
"Not defined") {
794 RESTError <<
"<addColumn key does not contain a expression value!" <<
RESTendl;
820 RESTInfo <<
"Exporting dataset" <<
RESTendl;
822 std::vector<std::string> columns =
fDataSet.GetColumnNames();
823 if (!excludeColumns.empty()) {
824 columns.erase(std::remove_if(columns.begin(), columns.end(),
825 [&excludeColumns](std::string elem) {
826 return std::find(excludeColumns.begin(), excludeColumns.end(),
827 elem) != excludeColumns.end();
831 RESTInfo <<
"Re-Generating snapshot." <<
RESTendl;
832 std::string user = getenv(
"USER");
833 std::string fOutName =
"/tmp/rest_output_" + user +
".root";
834 fDataSet.Snapshot(
"AnalysisTree", fOutName, columns);
836 RESTInfo <<
"Re-importing analysis tree." <<
RESTendl;
837 fDataSet = ROOT::RDataFrame(
"AnalysisTree", fOutName);
839 TFile* f = TFile::Open(fOutName.c_str());
840 fTree = (TChain*)f->Get(
"AnalysisTree");
845 if (excludeColumns.empty()) {
846 RESTInfo <<
"Re-Generating snapshot." <<
RESTendl;
847 std::string user = getenv(
"USER");
848 std::string fOutName =
"/tmp/rest_output_" + user +
".root";
849 fDataSet.Snapshot(
"AnalysisTree", fOutName);
851 TFile* f = TFile::Open(fOutName.c_str());
852 fTree = (TChain*)f->Get(
"AnalysisTree");
855 std::vector<std::string> dataTypes;
856 for (
int n = 0; n <
fTree->GetListOfBranches()->GetEntries(); n++) {
857 std::string bName =
fTree->GetListOfBranches()->At(n)->GetName();
858 std::string type =
fTree->GetLeaf((TString)bName)->GetTypeName();
859 dataTypes.push_back(type);
860 if (type !=
"Double_t" && type !=
"Int_t") {
861 RESTError <<
"Branch name : " << bName <<
" is type : " << type <<
RESTendl;
862 RESTError <<
"Only Int_t and Double_t types are allowed for "
863 "exporting to ASCII table"
865 RESTError <<
"File will not be generated" <<
RESTendl;
870 FILE* f = fopen(filename.c_str(),
"wt");
872 fprintf(f,
"### TRestDataSet generated file\n");
873 fprintf(f,
"### \n");
877 fprintf(f,
"### Accumulated run time (seconds) : %lf\n",
fTotalDuration);
878 fprintf(f,
"### Accumulated run time (hours) : %lf\n",
fTotalDuration / 3600.);
879 fprintf(f,
"### Accumulated run time (days) : %lf\n",
fTotalDuration / 3600. / 24.);
885 fprintf(f,
"### Metadata filters : \n");
888 fprintf(f,
"### - %s.", md.c_str());
898 fprintf(f,
"### Relevant quantities: \n");
899 for (
auto& [name, properties] :
fQuantity) {
900 fprintf(f,
"### - %s : %s - %s\n", name.c_str(), properties.value.c_str(),
901 properties.description.c_str());
904 fprintf(f,
"### Observables list: ");
905 for (
int n = 0; n <
fTree->GetListOfBranches()->GetEntries(); n++) {
906 std::string bName =
fTree->GetListOfBranches()->At(n)->GetName();
907 fprintf(f,
" %s", bName.c_str());
911 fprintf(f,
"### Data starts here\n");
913 auto obsNames =
fDataSet.GetColumnNames();
914 std::string obsListStr =
"";
915 for (
const auto& l : obsNames) {
916 if (!obsListStr.empty()) obsListStr +=
":";
921 fTree->Draw((TString)obsListStr,
"",
"goff");
923 for (
unsigned int n = 0; n <
fTree->GetEntries(); n++) {
925 std::string bName =
fTree->GetListOfBranches()->At(m)->GetName();
926 if (m > 0) fprintf(f,
"\t");
927 if (dataTypes[m] ==
"Double_t")
928 if (bName ==
"timeStamp")
929 fprintf(f,
"%010.0lf",
fTree->GetVal(m)[n]);
931 fprintf(f,
"%05.3e",
fTree->GetVal(m)[n]);
933 fprintf(f,
"%06d", (Int_t)
fTree->GetVal(m)[n]);
941 fDataSet.Snapshot(
"AnalysisTree", filename);
943 TFile* f = TFile::Open(filename.c_str(),
"UPDATE");
944 std::string name = this->GetName();
945 if (name.empty()) name =
"mock";
946 this->
Write(name.c_str());
952 RESTInfo <<
"Dataset generated: " << filename <<
RESTendl;
959 SetName(dS.GetName());
986 auto obsNames = GetObservablesList();
988 if (std::find(obsNames.begin(), obsNames.end(), obs) != obsNames.end()) {
989 RESTError <<
"Cannot merge dataSets with different observable list " <<
RESTendl;
1012 RESTError <<
"Datasets can only be imported from root files" <<
RESTendl;
1017 TFile* file = TFile::Open(fileName.c_str(),
"READ");
1018 if (file !=
nullptr) {
1019 TIter nextkey(file->GetListOfKeys());
1021 while ((key = (TKey*)nextkey())) {
1022 std::string kName = key->GetClassName();
1031 if (dS ==
nullptr) {
1032 RESTError << fileName <<
" is not a valid dataSet" <<
RESTendl;
1037 ROOT::EnableImplicitMT();
1039 ROOT::DisableImplicitMT();
1041 fDataSet = ROOT::RDataFrame(
"AnalysisTree", fileName);
1043 fTree = (TChain*)file->Get(
"AnalysisTree");
1056 for (
const auto& fN : fileNames)
1058 RESTError <<
"Datasets can only be imported from root files" <<
RESTendl;
1063 auto it = fileNames.begin();
1064 while (it != fileNames.end()) {
1065 std::string fileName = *it;
1066 TFile* file = TFile::Open(fileName.c_str(),
"READ");
1067 bool isValid =
false;
1068 if (file !=
nullptr) {
1069 TIter nextkey(file->GetListOfKeys());
1071 while ((key = (TKey*)nextkey())) {
1072 std::string kName = key->GetClassName();
1083 isValid =
Merge(*dS);
1086 if (isValid) count++;
1090 RESTError <<
"Cannot open " << fileName <<
RESTendl;
1094 RESTError << fileName <<
" is not a valid dataSet skipping..." <<
RESTendl;
1095 it = fileNames.erase(it);
1101 if (fileNames.empty()) {
1102 RESTError <<
"File selection is empty, dataSet will not be imported " <<
RESTendl;
1106 RESTInfo <<
"Opening list of files. First file: " << fileNames[0] <<
RESTendl;
1107 fDataSet = ROOT::RDataFrame(
"AnalysisTree", fileNames);
1109 if (
fTree !=
nullptr) {
1113 fTree =
new TChain(
"AnalysisTree");
1115 for (
const auto& fN : fileNames)
fTree->Add((TString)fN);
std::vector< std::string > GetObservableNames()
It returns a vector with strings containing all the observables that exist in the analysis tree.
A class to help on cuts definitions. To be used with TRestAnalysisTree.
It allows to group a number of runs that satisfy given metadata conditions.
std::vector< std::string > fFilterContains
If not empty it will check if the metadata member contains the string.
virtual std::vector< std::string > FileSelection()
Function to determine the filenames that satisfy the dataset conditions.
std::vector< Double_t > fFilterLowerThan
If the corresponding element is not empty it will check if the metadata member is lower.
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestDataSet.
TChain * fTree
A pointer to the generated tree.
std::vector< std::string > fProcessObservablesList
It contains a list of the processes for which all observables should be added.
void Import(const std::string &fileName)
This function imports metadata from a root file it import metadata info from the previous dataSet whi...
std::map< std::string, RelevantQuantity > fQuantity
The properties of a relevant quantity that we want to store together with the dataset.
ROOT::RDF::RNode fDataSet
The resulting RDF::RNode object after initialization.
std::vector< std::pair< std::string, std::string > > fColumnNameExpressions
A list of new columns together with their corresponding expressions added to the dataset.
ROOT::RDF::RNode DefineColumn(const std::string &columnName, const std::string &formula)
This function will add a new column to the RDataFrame using the same scheme as the usual RDF::Define ...
Double_t fEndTime
TimeStamp for the end time of the last file.
size_t GetNumberOfBranches()
Number of variables (or observables)
TRestDataSet()
Default constructor.
Double_t GetTotalTimeInSeconds() const
It returns the accumulated run time in seconds.
ROOT::RDF::RNode MakeCut(const TRestCut *cut)
This function applies a TRestCut to the dataframe and returns a dataframe with the applied cuts....
void GenerateDataSet()
This function generates the data frame with the filelist and column names (or observables) that have ...
Bool_t fMT
A flag to enable Multithreading during dataframe generation.
TRestCut * fCut
Parameter cuts over the selected dataset.
void Export(const std::string &filename, std::vector< std::string > excludeColumns={})
It will generate an output file with the dataset compilation. Only the selected branches and the file...
std::string fFilterStartTime
All the selected runs will have a starting date after fStartTime.
Bool_t Merge(const TRestDataSet &dS)
This function merges different TRestDataSet metadata into the current dataSet.
std::vector< std::string > fFilterMetadata
A list of metadata members where filters will be applied.
std::vector< std::string > fFileSelection
A list populated by the FileSelection method using the conditions of the dataset.
std::vector< std::string > GetFileSelection()
It returns a list of the files that have been finally selected.
std::string fFilterEndTime
All the selected runs will have an ending date before fEndTime.
Double_t fStartTime
TimeStamp for the start time of the first file.
std::vector< std::string > fObservablesList
It contains a list of the observables that will be added to the final tree or exported file.
Bool_t fMergedDataset
It keeps track if the generated dataset is a pure dataset or a merged one.
void Initialize() override
This function initializes different parameters from the TRestDataSet.
std::vector< std::string > fImportedFiles
The list of dataset files imported.
Double_t fTotalDuration
The total integrated run time of selected files.
std::string fFilePattern
A glob file pattern that must be satisfied by all files.
std::vector< Double_t > fFilterGreaterThan
If the corresponding element is not empty it will check if the metadata member is greater.
std::vector< Double_t > fFilterEqualsTo
If the corresponding element is not empty it will check if the metadata member is equal.
void InitFromConfigFile() override
Initialization of specific TRestDataSet members through an RML file.
TRestDataSet & operator=(TRestDataSet &dS)
Operator to copy TRestDataSet metadata.
~TRestDataSet()
Default destructor.
Data provider and manager in REST.
std::string ReplaceMetadataMembers(const std::string &instr, Int_t precision=8)
It will replace the data members contained inside the string given as input. The data members in the ...
@ REST_Info
+show most of the information for each step
TClass * GetClassQuick(std::string type)
time_t StringToTimeStamp(std::string time)
A method to convert a date/time formatted string to a timestamp.
std::vector< std::string > Split(std::string in, std::string separator, bool allowBlankString=false, bool removeWhiteSpaces=false, int startPos=-1)
Split the input string according to the given separator. Returning a vector of fragments.
Double_t StringToDouble(std::string in)
Gets a double from a string.
std::string ToDateTimeString(time_t time)
Format time_t into string.
std::string Replace(std::string in, std::string thisString, std::string byThisString, size_t fromPosition=0, Int_t N=0)
Replace any occurrences of thisString by byThisString inside string in.
std::string metadata
The associated metadata member used to register the relevant quantity.
std::string description
A user given description that can be used to define the relevant quantity.
std::string strategy
It determines how to produce the relevant quantity (accumulate/unique/last/max/min)
std::string value
The quantity value.