REST-for-Physics  v2.3
Rare Event Searches ToolKit for Physics
TRestComponentDataSet.cxx
1/*************************************************************************
2 * This file is part of the REST software framework. *
3 * *
4 * Copyright (C) 2016 GIFNA/TREX (University of Zaragoza) *
5 * For more information see https://gifna.unizar.es/trex *
6 * *
7 * REST is free software: you can redistribute it and/or modify *
8 * it under the terms of the GNU General Public License as published by *
9 * the Free Software Foundation, either version 3 of the License, or *
10 * (at your option) any later version. *
11 * *
12 * REST is distributed in the hope that it will be useful, *
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 * GNU General Public License for more details. *
16 * *
17 * You should have a copy of the GNU General Public License along with *
18 * REST in $REST_PATH/LICENSE. *
19 * If not, see https://www.gnu.org/licenses/. *
20 * For the list of contributors see $REST_PATH/CREDITS. *
21 *************************************************************************/
22
85#include "TRestComponentDataSet.h"
86
87#include <TKey.h>
88
89#include <numeric>
90
91ClassImp(TRestComponentDataSet);
92
97
102
117TRestComponentDataSet::TRestComponentDataSet(const char* cfgFileName, const std::string& name)
118 : TRestComponent(cfgFileName) {
120
122}
123
130
131 SetSectionName(this->ClassName());
132
133 LoadDataSets();
134}
135
141
142 if (!fDataSetFileNames.empty()) {
143 RESTMetadata << " " << RESTendl;
144 RESTMetadata << " == Dataset filenames ==" << RESTendl;
145
146 for (const auto& x : fDataSetFileNames) RESTMetadata << "- " << x << RESTendl;
147
148 RESTMetadata << " " << RESTendl;
149 }
150
151 if (fDFRange.X() != 0 || fDFRange.Y() != 0) {
152 RESTMetadata << " DataFrame range: ( " << fDFRange.X() << ", " << fDFRange.Y() << ")" << RESTendl;
153 RESTMetadata << " " << RESTendl;
154 }
155
156 if (!fParameter.empty() && fParameterizationNodes.empty()) {
157 RESTMetadata << "This component has no nodes!" << RESTendl;
158 RESTMetadata << " Use: LoadDataSets() to initialize the nodes" << RESTendl;
159 }
160
161 if (!fWeights.empty()) {
162 RESTMetadata << " " << RESTendl;
163 RESTMetadata << " == Weights ==" << RESTendl;
164
165 for (const auto& x : fWeights) RESTMetadata << "- " << x << RESTendl;
166
167 RESTMetadata << " " << RESTendl;
168 }
169
170 RESTMetadata << " Use : PrintStatistics() to check node statistics" << RESTendl;
171 RESTMetadata << "----" << RESTendl;
172}
173
178 if (fNSimPerNode.empty() && IsDataSetLoaded()) fNSimPerNode = ExtractNodeStatistics();
179
180 if (!HasNodes() && !IsDataSetLoaded()) {
181 RESTWarning << "TRestComponentDataSet::PrintStatistics. Empty nodes and no dataset loaded!"
182 << RESTendl;
183 RESTWarning << "Invoking TRestComponentDataSet::Initialize() might solve the problem" << RESTendl;
184 return;
185 }
186
187 auto result = std::accumulate(fNSimPerNode.begin(), fNSimPerNode.end(), 0);
188 RESTInfo << "Total counts : " << result << RESTendl;
189 std::cout << std::endl;
190
191 RESTInfo << " Parameter node statistics (" << fParameter << ")" << RESTendl;
192 int n = 0;
193 for (const auto& p : fParameterizationNodes) {
194 RESTInfo << " - Value : " << p << " Counts: " << fNSimPerNode[n] << RESTendl;
195 n++;
196 }
197}
198
204
205 auto ele = GetElement("dataset");
206 while (ele != nullptr) {
207 fDataSetFileNames.push_back(GetParameter("filename", ele, ""));
208 ele = GetNextElement(ele);
209 }
210
211 if (!fDataSetFileNames.empty()) Initialize();
212}
213
221 if (!fNodeDensity.empty()) return;
222
223 if (fNbins.size() == 0) {
224 RESTError
225 << "TRestComponentDataSet::FillHistograms. Trying to fill histograms but no variables found!"
226 << RESTendl;
227 return;
228 }
229
231
232 if (!IsDataSetLoaded()) {
233 RESTError << "TRestComponentDataSet::FillHistograms. Dataset has not been initialized!" << RESTendl;
234 return;
235 }
236
237 if (fParameterizationNodes.empty()) {
238 RESTWarning << "Nodes have not been defined" << RESTendl;
239 RESTWarning << "The full dataset will be used to generate the density distribution" << RESTendl;
240 fParameterizationNodes.push_back(-137);
241 }
242
243 RESTInfo << "Generating N-dim histograms" << RESTendl;
244 int nIndex = 0;
245 for (const auto& node : fParameterizationNodes) {
246 Int_t from = 0;
247 Int_t to = 0;
248 if (fSamples > 0 && fTotalSamples[nIndex] - fSamples > 0) {
249 from = fRandom->Integer(fTotalSamples[nIndex] - fSamples);
250 to = from + fSamples;
251 fNSimPerNode[nIndex] = fSamples;
252 }
253
254 ROOT::RDF::RNode df = ROOT::RDataFrame(0);
257 if (fParameterizationNodes.size() == 1 && node == -137) {
258 RESTInfo << "Creating component with no parameters (full dataset used)" << RESTendl;
259 df = fDataSet.GetDataFrame().Range(from, to);
261 } else {
262 RESTInfo << "Creating THnD for parameter " << fParameter << ": " << DoubleToString(node)
263 << RESTendl;
264 Double_t pUp = node * (1 + fPrecision / 2);
265 Double_t pDown = node * (1 - fPrecision / 2);
266 std::string filter = fParameter + " < " + DoubleToString(pUp) + " && " + fParameter + " > " +
267 DoubleToString(pDown);
268 df = fDataSet.GetDataFrame().Filter(filter).Range(from, to);
269 }
270
271 Int_t* bins = new Int_t[fNbins.size()];
272 Double_t* xmin = new Double_t[fNbins.size()];
273 Double_t* xmax = new Double_t[fNbins.size()];
274
275 for (size_t n = 0; n < fNbins.size(); n++) {
276 bins[n] = fNbins[n];
277 xmin[n] = fRanges[n].X();
278 xmax[n] = fRanges[n].Y();
279 }
280
281 TString hName = fParameter + "_" + DoubleToString(node);
282 if (fParameterizationNodes.empty()) hName = "full";
283
284 std::vector<std::string> varsAndWeight = fVariables;
285
286 if (!fWeights.empty()) {
287 std::string weightsStr = "";
288 for (size_t n = 0; n < fWeights.size(); n++) {
289 if (n > 0) weightsStr += "*";
290
291 weightsStr += fWeights[n];
292 }
293 df = df.Define("componentWeight", weightsStr);
294 varsAndWeight.push_back("componentWeight");
295 }
296
297 auto hn = df.HistoND({hName, hName, (int)fNbins.size(), bins, xmin, xmax}, varsAndWeight);
298 THnD* hNd = new THnD(*hn);
299 hNd->Scale(1. / fNSimPerNode[nIndex]);
300
301 fNodeDensity.push_back(hNd);
302 fActiveNode = nIndex;
303 nIndex++;
304 }
305}
306
314 if (fActiveNode >= 0 && fNodeDensity[fActiveNode]) {
316 } else {
317 RESTError << "TRestComponentDataSet::RegenerateActiveNode. Active node undefined!" << RESTendl;
318 return;
319 }
320
321 Int_t from = 0;
322 Int_t to = 0;
323 if (fSamples > 0 && fTotalSamples[fActiveNode] - fSamples > 0) {
324 from = fRandom->Integer(fTotalSamples[fActiveNode] - fSamples);
325 to = from + fSamples;
327 }
328
329 Double_t node = GetActiveNodeValue();
330 RESTInfo << "Creating THnD for parameter " << fParameter << ": " << DoubleToString(node) << RESTendl;
331
332 ROOT::RDF::RNode df = ROOT::RDataFrame(0);
333 Double_t pUp = node * (1 + fPrecision / 2);
334 Double_t pDown = node * (1 - fPrecision / 2);
335 std::string filter =
336 fParameter + " < " + DoubleToString(pUp) + " && " + fParameter + " > " + DoubleToString(pDown);
337 df = fDataSet.GetDataFrame().Filter(filter).Range(from, to);
338
339 Int_t* bins = new Int_t[fNbins.size()];
340 Double_t* xmin = new Double_t[fNbins.size()];
341 Double_t* xmax = new Double_t[fNbins.size()];
342
343 for (size_t n = 0; n < fNbins.size(); n++) {
344 bins[n] = fNbins[n];
345 xmin[n] = fRanges[n].X();
346 xmax[n] = fRanges[n].Y();
347 }
348
349 TString hName = fParameter + "_" + DoubleToString(node);
350 if (fParameterizationNodes.empty()) hName = "full";
351
352 std::vector<std::string> varsAndWeight = fVariables;
353
354 if (!fWeights.empty()) {
355 std::string weightsStr = "";
356 for (size_t n = 0; n < fWeights.size(); n++) {
357 if (n > 0) weightsStr += "*";
358
359 weightsStr += fWeights[n];
360 }
361 df = df.Define("componentWeight", weightsStr);
362 varsAndWeight.push_back("componentWeight");
363 }
364
365 auto hn = df.HistoND({hName, hName, (int)fNbins.size(), bins, xmin, xmax}, varsAndWeight);
366 THnD* hNd = new THnD(*hn);
367 hNd->Scale(1. / fNSimPerNode[fActiveNode]);
368
370}
371
381
382 RESTInfo << "Extracting parameterization nodes" << RESTendl;
383
384 std::vector<double> vs;
385 if (!IsDataSetLoaded()) {
386 RESTError << "TRestComponentDataSet::ExtractParameterizationNodes. Dataset has not been initialized!"
387 << RESTendl;
388 return vs;
389 }
390
391 auto GetUniqueElements = [](const std::vector<double>& vec) {
392 std::set<double> uniqueSet(vec.begin(), vec.end());
393 return std::vector<double>(uniqueSet.begin(), uniqueSet.end());
394 };
395
396 for (size_t n = 0; n < 1 + fDataSet.GetEntries() / fSplitEntries; n++) {
397 auto nEn = fDataSet.Range(n * fSplitEntries, (n + 1) * fSplitEntries).Count();
398 auto parValues = fDataSet.Range(n * fSplitEntries, (n + 1) * fSplitEntries).Take<double>(fParameter);
399 std::vector<double> uniqueVec = GetUniqueElements(*parValues);
400 vs.insert(vs.end(), uniqueVec.begin(), uniqueVec.end());
401 }
402
403 return vs;
404}
405
417 if (!fNSimPerNode.empty()) return fNSimPerNode;
418
419 fTotalSamples.clear();
420
421 std::vector<Int_t> stats;
422 if (!IsDataSetLoaded()) {
423 RESTError << "TRestComponentDataSet::ExtractNodeStatistics. Dataset has not been initialized!"
424 << RESTendl;
425 return stats;
426 }
427
428 RESTInfo << "Counting statistics for each node ..." << RESTendl;
429 RESTInfo << "Number of nodes : " << fParameterizationNodes.size() << RESTendl;
430 for (const auto& p : fParameterizationNodes) {
431 Double_t pUp = p * (1 + fPrecision / 2);
432 Double_t pDown = p * (1 - fPrecision / 2);
433 std::string filter =
434 fParameter + " < " + DoubleToString(pUp) + " && " + fParameter + " > " + DoubleToString(pDown);
435 RESTInfo << "Counting stats for : " << fParameter << " = " << p << RESTendl;
436 auto nEv = fDataSet.GetDataFrame().Filter(filter).Count();
437 fTotalSamples.push_back(*nEv);
438 RESTInfo << "Total entries for " << fParameter << ":" << p << " = " << *nEv << RESTendl;
439 if (fSamples != 0) {
440 nEv = fDataSet.GetDataFrame().Filter(filter).Range(fSamples).Count();
441 }
442
443 if ((Int_t)*nEv < fSamples) {
444 RESTWarning << "The number of requested samples (" << fSamples
445 << ") is higher than the number of dataset entries (" << *nEv << ")" << RESTendl;
446 }
447 RESTInfo << "Samples to be used for " << fParameter << ":" << p << " = " << *nEv << RESTendl;
448 stats.push_back(*nEv);
449 }
450 return stats;
451}
452
459 if (fDataSetFileNames.empty()) {
460 fDataSetLoaded = false;
461 return fDataSetLoaded;
462 }
463
464 RESTInfo << "Loading datasets" << RESTendl;
465
466 std::vector<std::string> fullFileNames;
467 for (const auto& name : fDataSetFileNames) {
468 // TODO we get here a list of files. However, we will need to weight each dataset with a factor
469 // to consider the contribution of each background component.
470 // Of course, we could previously take a factor into account already in the dataset, through the
471 // definition of a new column. But being this way would allow us to play around with the
472 // background model without having to regenerate the dataset.
473 std::string fileName = SearchFile(name);
474 if (fileName.empty()) {
475 RESTError << "TRestComponentDataSet::LoadDataSet. Error loading file : " << name << RESTendl;
476 RESTError << "Does the file exist?" << RESTendl;
477 RESTError << "You may use `<globals> <searchPath ...` to indicate the path location" << RESTendl;
478 return false;
479 }
480 fullFileNames.push_back(fileName);
481 }
482
483 fDataSet.Import(fullFileNames);
484 fDataSetLoaded = true;
485
486 if (fDFRange.X() != 0 || fDFRange.Y() != 0)
487 fDataSet.ApplyRange((size_t)fDFRange.X(), (size_t)fDFRange.Y());
488
489 if (fDataSet.GetTree() == nullptr) {
490 RESTError << "Problem loading dataset from file list :" << RESTendl;
491 for (const auto& f : fDataSetFileNames) RESTError << " - " << f << RESTendl;
492 return false;
493 }
494
496
497 if (VariablesOk() && WeightsOk()) {
499 RESTInfo << "Filling histograms" << RESTendl;
501 return fDataSetLoaded;
502 }
503
504 return fDataSetLoaded;
505}
506
511 Bool_t ok = true;
512 std::vector cNames = fDataSet.GetDataFrame().GetColumnNames();
513
514 for (const auto& var : fVariables)
515 if (std::count(cNames.begin(), cNames.end(), var) == 0) {
516 RESTError << "Variable ---> " << var << " <--- NOT found on dataset" << RESTendl;
517 ok = false;
518 }
519 return ok;
520}
521
526 Bool_t ok = true;
527 std::vector cNames = fDataSet.GetDataFrame().GetColumnNames();
528
529 for (const auto& var : fWeights) {
530 if (!isANumber(var) && std::count(cNames.begin(), cNames.end(), var) == 0) {
531 RESTError << "Weight ---> " << var << " <--- NOT found on dataset" << RESTendl;
532 ok = false;
533 }
534 }
535 return ok;
536}
537
543 if (!IsDataSetLoaded()) {
544 RESTWarning << "TRestComponentDataSet::ValidDataSet. Dataset has not been loaded" << RESTendl;
545 RESTWarning << "Try calling TRestComponentDataSet::Initialize()" << RESTendl;
546
547 RESTInfo << "Trying to load datasets" << RESTendl;
548 LoadDataSets();
549 if (IsDataSetLoaded()) {
550 RESTInfo << "Sucess!" << RESTendl;
551 } else {
552 RESTError << "Failed loading datasets" << RESTendl;
553 return false;
554 }
555 }
556
557 if (HasNodes() && fActiveNode == -1) {
558 RESTError << "TRestComponentDataSet::ValidDataSet. Active node has not been defined" << RESTendl;
559 return false;
560 }
561 return true;
562}
It defines a background/signal model distribution in a given parameter space (tipically x,...
long long unsigned int fSplitEntries
It helps to split large datasets when extracting the parameterization nodes.
Bool_t ValidDataSet()
Takes care of initializing datasets if have not been initialized. On sucess it returns true.
TRestDataSet fDataSet
The dataset used to initialize the distribution.
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestAxionSolarFlux.
Bool_t fDataSetLoaded
It is true of the dataset was loaded without issues.
TVector2 fDFRange
It creates a sample subset using a range definition.
void FillHistograms() override
It will produce a histogram with the distribution defined using the variables and the weights for eac...
std::vector< Int_t > fNSimPerNode
std::vector< std::string > fWeights
A list with the dataset columns used to weight the distribution density and define rate.
Bool_t LoadDataSets()
A method responsible to import a list of TRestDataSet into fDataSet and check that the variables and ...
Bool_t VariablesOk()
It returns true if all variables have been found inside TRestDataSet.
void PrintStatistics()
It prints out the statistics available for each parametric node.
TRestComponentDataSet()
Default constructor.
~TRestComponentDataSet()
Default destructor.
void RegenerateActiveNodeDensity() override
It will regenerate the density histogram for the active node. It is practical in the case when the nu...
std::vector< Int_t > fTotalSamples
It defines the total number of entries for each parameterization node (Initialized by the dataset)
std::vector< Double_t > ExtractParameterizationNodes()
It returns a vector with all the different values found on the dataset column for the user given para...
void Initialize() override
It will initialize the data frame with the filelist and column names (or observables) that have been ...
Bool_t WeightsOk()
It returns true if all weights have been found inside TRestDataSet.
std::vector< std::string > fDataSetFileNames
The filename of the dataset used.
std::vector< Int_t > ExtractNodeStatistics()
It returns a vector with the number of entries found for each parameterization node.
void InitFromConfigFile() override
It customizes the retrieval of XML data values of this class.
It defines a background/signal model distribution in a given parameter space (tipically x,...
void InitFromConfigFile() override
It customizes the retrieval of XML data values of this class.
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestAxionSolarFlux.
Int_t fActiveNode
It is used to define the node that will be accessed for rate retrieval.
Int_t fSamples
It introduces a fixed number of samples (if 0 it will take all available samples)
std::string fParameter
It is used to parameterize a set of distribution densities (e.g. WIMP or axion mass)
Float_t fPrecision
A precision used to select the node value with a given range defined as a fraction of the value.
std::vector< Int_t > fNbins
The number of bins in which we should divide each variable.
void Initialize() override
It initializes the random number. We avoid to define the section name here since we will never define...
std::vector< TVector2 > fRanges
The range of each of the variables used to create the PDF distribution.
std::vector< Double_t > fParameterizationNodes
It defines the nodes of the parameterization (Initialized by the dataset)
Bool_t HasNodes()
It returns true if any nodes have been defined.
TRandom3 * fRandom
Internal process random generator.
std::vector< std::string > fVariables
A list with the branches that will be used to create the distribution space.
std::vector< THnD * > fNodeDensity
The generated N-dimensional variable space density for a given node.
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestDataSet.
void Import(const std::string &fileName)
This function imports metadata from a root file it import metadata info from the previous dataSet whi...
ROOT::RDF::RNode Range(size_t from, size_t to)
This method returns a RDataFrame node with the number of samples inside the dataset by selecting a ra...
size_t GetEntries()
It returns the number of entries found inside fDataFrame and prints out a warning if the number of en...
ROOT::RDF::RNode GetDataFrame() const
Gives access to the RDataFrame.
Definition: TRestDataSet.h:129
TTree * GetTree() const
Gives access to the tree.
Definition: TRestDataSet.h:138
ROOT::RDF::RNode ApplyRange(size_t from, size_t to)
This method reduces the number of samples inside the dataset by selecting a range.
endl_t RESTendl
Termination flag object for TRestStringOutput.
TiXmlElement * GetElement(std::string eleDeclare, TiXmlElement *e=nullptr)
Get an xml element from a given parent element, according to its declaration.
Int_t LoadConfigFromFile(const std::string &configFilename, const std::string &sectionName="")
Give the file name, find out the corresponding section. Then call the main starter.
TRestStringOutput::REST_Verbose_Level GetVerboseLevel()
returns the verboselevel in type of REST_Verbose_Level enumerator
void SetSectionName(std::string sName)
set the section name, clear the section content
std::string SearchFile(std::string filename)
Search files in current directory and directories specified in "searchPath" section.
std::string fConfigFileName
Full name of the rml file.
TiXmlElement * GetNextElement(TiXmlElement *e)
Get the next sibling xml element of this element, with same eleDeclare.
std::string GetParameter(std::string parName, TiXmlElement *e, TString defaultValue=PARAMETER_NOT_FOUND_STR)
Returns the value for the parameter named parName in the given section.
@ REST_Info
+show most of the information for each steps
std::string DoubleToString(Double_t d, std::string format="%8.6e")
Gets a string from a double.
Int_t isANumber(std::string in)
Returns 1 only if a valid number is found in the string in. If not it returns 0.