REST-for-Physics  v2.3
Rare Event Searches ToolKit for Physics
TRestComponentDataSet.cxx
1/*************************************************************************
2 * This file is part of the REST software framework. *
3 * *
4 * Copyright (C) 2016 GIFNA/TREX (University of Zaragoza) *
5 * For more information see https://gifna.unizar.es/trex *
6 * *
7 * REST is free software: you can redistribute it and/or modify *
8 * it under the terms of the GNU General Public License as published by *
9 * the Free Software Foundation, either version 3 of the License, or *
10 * (at your option) any later version. *
11 * *
12 * REST is distributed in the hope that it will be useful, *
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15 * GNU General Public License for more details. *
16 * *
17 * You should have a copy of the GNU General Public License along with *
18 * REST in $REST_PATH/LICENSE. *
19 * If not, see https://www.gnu.org/licenses/. *
20 * For the list of contributors see $REST_PATH/CREDITS. *
21 *************************************************************************/
22
85#include "TRestComponentDataSet.h"
86
87#include <TKey.h>
88
89#include <numeric>
90
91ClassImp(TRestComponentDataSet);
92
97
102
117TRestComponentDataSet::TRestComponentDataSet(const char* cfgFileName, const std::string& name)
118 : TRestComponent(cfgFileName) {
120
122}
123
130
131 SetSectionName(this->ClassName());
132
133 LoadDataSets();
134}
135
141
142 if (!fDataSetFileNames.empty()) {
143 RESTMetadata << " " << RESTendl;
144 RESTMetadata << " == Dataset filenames ==" << RESTendl;
145
146 for (const auto& x : fDataSetFileNames) RESTMetadata << "- " << x << RESTendl;
147
148 RESTMetadata << " " << RESTendl;
149 }
150
151 if (!fParameter.empty() && fParameterizationNodes.empty()) {
152 RESTMetadata << "This component has no nodes!" << RESTendl;
153 RESTMetadata << " Use: LoadDataSets() to initialize the nodes" << RESTendl;
154 }
155
156 if (!fWeights.empty()) {
157 RESTMetadata << " " << RESTendl;
158 RESTMetadata << " == Weights ==" << RESTendl;
159
160 for (const auto& x : fWeights) RESTMetadata << "- " << x << RESTendl;
161
162 RESTMetadata << " " << RESTendl;
163 }
164
165 RESTMetadata << " Use : PrintStatistics() to check node statistics" << RESTendl;
166 RESTMetadata << "----" << RESTendl;
167}
168
173 if (fNSimPerNode.empty() && IsDataSetLoaded()) fNSimPerNode = ExtractNodeStatistics();
174
175 if (!HasNodes() && !IsDataSetLoaded()) {
176 RESTWarning << "TRestComponentDataSet::PrintStatistics. Empty nodes and no dataset loaded!"
177 << RESTendl;
178 RESTWarning << "Invoking TRestComponentDataSet::Initialize() might solve the problem" << RESTendl;
179 return;
180 }
181
182 auto result = std::accumulate(fNSimPerNode.begin(), fNSimPerNode.end(), 0);
183 RESTInfo << "Total counts : " << result << RESTendl;
184 std::cout << std::endl;
185
186 RESTInfo << " Parameter node statistics (" << fParameter << ")" << RESTendl;
187 int n = 0;
188 for (const auto& p : fParameterizationNodes) {
189 RESTInfo << " - Value : " << p << " Counts: " << fNSimPerNode[n] << RESTendl;
190 n++;
191 }
192}
193
199
200 auto ele = GetElement("dataset");
201 while (ele != nullptr) {
202 fDataSetFileNames.push_back(GetParameter("filename", ele, ""));
203 ele = GetNextElement(ele);
204 }
205
206 if (!fDataSetFileNames.empty()) Initialize();
207}
208
216 if (!fNodeDensity.empty()) return;
217
218 if (fNbins.size() == 0) {
219 RESTError
220 << "TRestComponentDataSet::FillHistograms. Trying to fill histograms but no variables found!"
221 << RESTendl;
222 return;
223 }
224
226
227 if (!IsDataSetLoaded()) {
228 RESTError << "TRestComponentDataSet::FillHistograms. Dataset has not been initialized!" << RESTendl;
229 return;
230 }
231
232 if (fParameterizationNodes.empty()) {
233 RESTWarning << "Nodes have not been defined" << RESTendl;
234 RESTWarning << "The full dataset will be used to generate the density distribution" << RESTendl;
235 fParameterizationNodes.push_back(-137);
236 }
237
238 RESTInfo << "Generating N-dim histograms" << RESTendl;
239 int nIndex = 0;
240 for (const auto& node : fParameterizationNodes) {
241 Int_t from = 0;
242 Int_t to = 0;
243 if (fSamples > 0 && fTotalSamples[nIndex] - fSamples > 0) {
244 from = fRandom->Integer(fTotalSamples[nIndex] - fSamples);
245 to = from + fSamples;
246 fNSimPerNode[nIndex] = fSamples;
247 }
248
249 ROOT::RDF::RNode df = ROOT::RDataFrame(0);
252 if (fParameterizationNodes.size() == 1 && node == -137) {
253 RESTInfo << "Creating component with no parameters (full dataset used)" << RESTendl;
254 df = fDataSet.GetDataFrame().Range(from, to);
256 } else {
257 RESTInfo << "Creating THnD for parameter " << fParameter << ": " << DoubleToString(node)
258 << RESTendl;
259 Double_t pUp = node * (1 + fPrecision / 2);
260 Double_t pDown = node * (1 - fPrecision / 2);
261 std::string filter = fParameter + " < " + DoubleToString(pUp) + " && " + fParameter + " > " +
262 DoubleToString(pDown);
263 df = fDataSet.GetDataFrame().Filter(filter).Range(from, to);
264 }
265
266 Int_t* bins = new Int_t[fNbins.size()];
267 Double_t* xmin = new Double_t[fNbins.size()];
268 Double_t* xmax = new Double_t[fNbins.size()];
269
270 for (size_t n = 0; n < fNbins.size(); n++) {
271 bins[n] = fNbins[n];
272 xmin[n] = fRanges[n].X();
273 xmax[n] = fRanges[n].Y();
274 }
275
276 TString hName = fParameter + "_" + DoubleToString(node);
277 if (fParameterizationNodes.empty()) hName = "full";
278
279 std::vector<std::string> varsAndWeight = fVariables;
280
281 if (!fWeights.empty()) {
282 std::string weightsStr = "";
283 for (size_t n = 0; n < fWeights.size(); n++) {
284 if (n > 0) weightsStr += "*";
285
286 weightsStr += fWeights[n];
287 }
288 df = df.Define("componentWeight", weightsStr);
289 varsAndWeight.push_back("componentWeight");
290 }
291
292 auto hn = df.HistoND({hName, hName, (int)fNbins.size(), bins, xmin, xmax}, varsAndWeight);
293 THnD* hNd = new THnD(*hn);
294 hNd->Scale(1. / fNSimPerNode[nIndex]);
295
296 fNodeDensity.push_back(hNd);
297 fActiveNode = nIndex;
298 nIndex++;
299 }
300}
301
309 if (fActiveNode >= 0 && fNodeDensity[fActiveNode]) {
311 } else {
312 RESTError << "TRestComponentDataSet::RegenerateActiveNode. Active node undefined!" << RESTendl;
313 return;
314 }
315
316 Int_t from = 0;
317 Int_t to = 0;
318 if (fSamples > 0 && fTotalSamples[fActiveNode] - fSamples > 0) {
319 from = fRandom->Integer(fTotalSamples[fActiveNode] - fSamples);
320 to = from + fSamples;
322 }
323
324 Double_t node = GetActiveNodeValue();
325 RESTInfo << "Creating THnD for parameter " << fParameter << ": " << DoubleToString(node) << RESTendl;
326
327 ROOT::RDF::RNode df = ROOT::RDataFrame(0);
328 Double_t pUp = node * (1 + fPrecision / 2);
329 Double_t pDown = node * (1 - fPrecision / 2);
330 std::string filter =
331 fParameter + " < " + DoubleToString(pUp) + " && " + fParameter + " > " + DoubleToString(pDown);
332 df = fDataSet.GetDataFrame().Filter(filter).Range(from, to);
333
334 Int_t* bins = new Int_t[fNbins.size()];
335 Double_t* xmin = new Double_t[fNbins.size()];
336 Double_t* xmax = new Double_t[fNbins.size()];
337
338 for (size_t n = 0; n < fNbins.size(); n++) {
339 bins[n] = fNbins[n];
340 xmin[n] = fRanges[n].X();
341 xmax[n] = fRanges[n].Y();
342 }
343
344 TString hName = fParameter + "_" + DoubleToString(node);
345 if (fParameterizationNodes.empty()) hName = "full";
346
347 std::vector<std::string> varsAndWeight = fVariables;
348
349 if (!fWeights.empty()) {
350 std::string weightsStr = "";
351 for (size_t n = 0; n < fWeights.size(); n++) {
352 if (n > 0) weightsStr += "*";
353
354 weightsStr += fWeights[n];
355 }
356 df = df.Define("componentWeight", weightsStr);
357 varsAndWeight.push_back("componentWeight");
358 }
359
360 auto hn = df.HistoND({hName, hName, (int)fNbins.size(), bins, xmin, xmax}, varsAndWeight);
361 THnD* hNd = new THnD(*hn);
362 hNd->Scale(1. / fNSimPerNode[fActiveNode]);
363
365}
366
376
377 RESTInfo << "Extracting parameterization nodes" << RESTendl;
378
379 std::vector<double> vs;
380 if (!IsDataSetLoaded()) {
381 RESTError << "TRestComponentDataSet::ExtractParameterizationNodes. Dataset has not been initialized!"
382 << RESTendl;
383 return vs;
384 }
385
386 auto parValues = fDataSet.GetDataFrame().Take<double>(fParameter);
387 for (const auto v : parValues) vs.push_back(v);
388
389 std::vector<double>::iterator ip;
390 ip = std::unique(vs.begin(), vs.begin() + vs.size());
391 vs.resize(std::distance(vs.begin(), ip));
392 std::sort(vs.begin(), vs.end());
393 ip = std::unique(vs.begin(), vs.end());
394 vs.resize(std::distance(vs.begin(), ip));
395
396 return vs;
397}
398
410 if (!fNSimPerNode.empty()) return fNSimPerNode;
411
412 fTotalSamples.clear();
413
414 std::vector<Int_t> stats;
415 if (!IsDataSetLoaded()) {
416 RESTError << "TRestComponentDataSet::ExtractNodeStatistics. Dataset has not been initialized!"
417 << RESTendl;
418 return stats;
419 }
420
421 RESTInfo << "Counting statistics for each node ..." << RESTendl;
422 RESTInfo << "Number of nodes : " << fParameterizationNodes.size() << RESTendl;
423 for (const auto& p : fParameterizationNodes) {
424 Double_t pUp = p * (1 + fPrecision / 2);
425 Double_t pDown = p * (1 - fPrecision / 2);
426 std::string filter =
427 fParameter + " < " + DoubleToString(pUp) + " && " + fParameter + " > " + DoubleToString(pDown);
428 RESTInfo << "Counting stats for : " << fParameter << " = " << p << RESTendl;
429 auto nEv = fDataSet.GetDataFrame().Filter(filter).Count();
430 fTotalSamples.push_back(*nEv);
431 RESTInfo << "Total entries for " << fParameter << ":" << p << " = " << *nEv << RESTendl;
432 if (fSamples != 0) {
433 nEv = fDataSet.GetDataFrame().Filter(filter).Range(fSamples).Count();
434 }
435
436 if ((Int_t)*nEv < fSamples) {
437 RESTWarning << "The number of requested samples (" << fSamples
438 << ") is higher than the number of dataset entries (" << *nEv << ")" << RESTendl;
439 }
440 RESTInfo << "Samples to be used for " << fParameter << ":" << p << " = " << *nEv << RESTendl;
441 stats.push_back(*nEv);
442 }
443 return stats;
444}
445
452 if (fDataSetFileNames.empty()) {
453 fDataSetLoaded = false;
454 return fDataSetLoaded;
455 }
456
457 RESTInfo << "Loading datasets" << RESTendl;
458
459 std::vector<std::string> fullFileNames;
460 for (const auto& name : fDataSetFileNames) {
461 // TODO we get here a list of files. However, we will need to weight each dataset with a factor
462 // to consider the contribution of each background component.
463 // Of course, we could previously take a factor into account already in the dataset, through the
464 // definition of a new column. But being this way would allow us to play around with the
465 // background model without having to regenerate the dataset.
466 std::string fileName = SearchFile(name);
467 if (fileName.empty()) {
468 RESTError << "TRestComponentDataSet::LoadDataSet. Error loading file : " << name << RESTendl;
469 RESTError << "Does the file exist?" << RESTendl;
470 RESTError << "You may use `<globals> <searchPath ...` to indicate the path location" << RESTendl;
471 return false;
472 }
473 fullFileNames.push_back(fileName);
474 }
475
476 fDataSet.Import(fullFileNames);
477 fDataSetLoaded = true;
478
479 if (fDataSet.GetTree() == nullptr) {
480 RESTError << "Problem loading dataset from file list :" << RESTendl;
481 for (const auto& f : fDataSetFileNames) RESTError << " - " << f << RESTendl;
482 return false;
483 }
484
486
487 if (VariablesOk() && WeightsOk()) {
490 return fDataSetLoaded;
491 }
492
493 return fDataSetLoaded;
494}
495
500 Bool_t ok = true;
501 std::vector cNames = fDataSet.GetDataFrame().GetColumnNames();
502
503 for (const auto& var : fVariables)
504 if (std::count(cNames.begin(), cNames.end(), var) == 0) {
505 RESTError << "Variable ---> " << var << " <--- NOT found on dataset" << RESTendl;
506 ok = false;
507 }
508 return ok;
509}
510
515 Bool_t ok = true;
516 std::vector cNames = fDataSet.GetDataFrame().GetColumnNames();
517
518 for (const auto& var : fWeights)
519 if (std::count(cNames.begin(), cNames.end(), var) == 0) {
520 RESTError << "Weight ---> " << var << " <--- NOT found on dataset" << RESTendl;
521 ok = false;
522 }
523 return ok;
524}
525
531 if (!IsDataSetLoaded()) {
532 RESTWarning << "TRestComponentDataSet::ValidDataSet. Dataset has not been loaded" << RESTendl;
533 RESTWarning << "Try calling TRestComponentDataSet::Initialize()" << RESTendl;
534
535 RESTInfo << "Trying to load datasets" << RESTendl;
536 LoadDataSets();
537 if (IsDataSetLoaded()) {
538 RESTInfo << "Sucess!" << RESTendl;
539 } else {
540 RESTError << "Failed loading datasets" << RESTendl;
541 return false;
542 }
543 }
544
545 if (HasNodes() && fActiveNode == -1) {
546 RESTError << "TRestComponentDataSet::ValidDataSet. Active node has not been defined" << RESTendl;
547 return false;
548 }
549 return true;
550}
It defines a background/signal model distribution in a given parameter space (tipically x,...
Bool_t ValidDataSet()
Takes care of initializing datasets if have not been initialized. On sucess it returns true.
TRestDataSet fDataSet
The dataset used to initialize the distribution.
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestAxionSolarFlux.
Bool_t fDataSetLoaded
It is true of the dataset was loaded without issues.
void FillHistograms() override
It will produce a histogram with the distribution defined using the variables and the weights for eac...
std::vector< Int_t > fNSimPerNode
std::vector< std::string > fWeights
A list with the dataset columns used to weight the distribution density and define rate.
Bool_t LoadDataSets()
A method responsible to import a list of TRestDataSet into fDataSet and check that the variables and ...
Bool_t VariablesOk()
It returns true if all variables have been found inside TRestDataSet.
void PrintStatistics()
It prints out the statistics available for each parametric node.
TRestComponentDataSet()
Default constructor.
~TRestComponentDataSet()
Default destructor.
void RegenerateActiveNodeDensity() override
It will regenerate the density histogram for the active node. It is practical in the case when the nu...
std::vector< Int_t > fTotalSamples
It defines the total number of entries for each parameterization node (Initialized by the dataset)
std::vector< Double_t > ExtractParameterizationNodes()
It returns a vector with all the different values found on the dataset column for the user given para...
void Initialize() override
It will initialize the data frame with the filelist and column names (or observables) that have been ...
Bool_t WeightsOk()
It returns true if all weights have been found inside TRestDataSet.
std::vector< std::string > fDataSetFileNames
The filename of the dataset used.
std::vector< Int_t > ExtractNodeStatistics()
It returns a vector with the number of entries found for each parameterization node.
void InitFromConfigFile() override
It customizes the retrieval of XML data values of this class.
It defines a background/signal model distribution in a given parameter space (tipically x,...
void InitFromConfigFile() override
It customizes the retrieval of XML data values of this class.
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestAxionSolarFlux.
Int_t fActiveNode
It is used to define the node that will be accessed for rate retrieval.
Int_t fSamples
It introduces a fixed number of samples (if 0 it will take all available samples)
std::string fParameter
It is used to parameterize a set of distribution densities (e.g. WIMP or axion mass)
Float_t fPrecision
A precision used to select the node value with a given range defined as a fraction of the value.
std::vector< Int_t > fNbins
The number of bins in which we should divide each variable.
void Initialize() override
It initializes the random number. We avoid to define the section name here since we will never define...
std::vector< TVector2 > fRanges
The range of each of the variables used to create the PDF distribution.
std::vector< Double_t > fParameterizationNodes
It defines the nodes of the parameterization (Initialized by the dataset)
Bool_t HasNodes()
It returns true if any nodes have been defined.
TRandom3 * fRandom
Internal process random generator.
std::vector< std::string > fVariables
A list with the branches that will be used to create the distribution space.
std::vector< THnD * > fNodeDensity
The generated N-dimensional variable space density for a given node.
void PrintMetadata() override
Prints on screen the information about the metadata members of TRestDataSet.
void Import(const std::string &fileName)
This function imports metadata from a root file it import metadata info from the previous dataSet whi...
ROOT::RDF::RNode GetDataFrame() const
Gives access to the RDataFrame.
Definition: TRestDataSet.h:127
TTree * GetTree() const
Gives access to the tree.
Definition: TRestDataSet.h:136
endl_t RESTendl
Termination flag object for TRestStringOutput.
TiXmlElement * GetElement(std::string eleDeclare, TiXmlElement *e=nullptr)
Get an xml element from a given parent element, according to its declaration.
Int_t LoadConfigFromFile(const std::string &configFilename, const std::string &sectionName="")
Give the file name, find out the corresponding section. Then call the main starter.
TRestStringOutput::REST_Verbose_Level GetVerboseLevel()
returns the verboselevel in type of REST_Verbose_Level enumerator
void SetSectionName(std::string sName)
set the section name, clear the section content
std::string SearchFile(std::string filename)
Search files in current directory and directories specified in "searchPath" section.
std::string fConfigFileName
Full name of the rml file.
TiXmlElement * GetNextElement(TiXmlElement *e)
Get the next sibling xml element of this element, with same eleDeclare.
std::string GetParameter(std::string parName, TiXmlElement *e, TString defaultValue=PARAMETER_NOT_FOUND_STR)
Returns the value for the parameter named parName in the given section.
@ REST_Info
+show most of the information for each steps
std::string DoubleToString(Double_t d, std::string format="%8.6e")
Gets a string from a double.