@@ -44,11 +44,13 @@ namespace {
4444 TIndexType Leaf;
4545 double Gain;
4646 TCandidateInfo BestCandidate;
47+ TVector<TBucketStats> Stats;
4748
48- TSplitLeafCandidate (TIndexType leaf, double gain, const TCandidateInfo& bestCandidate)
49+ TSplitLeafCandidate (TIndexType leaf, double gain, const TCandidateInfo& bestCandidate, TVector<TBucketStats> stats )
4950 : Leaf(leaf)
5051 , Gain(gain)
5152 , BestCandidate(bestCandidate)
53+ , Stats(stats)
5254 {}
5355
5456 bool operator <(const TSplitLeafCandidate& other) const {
@@ -1288,132 +1290,6 @@ static void SplitDocsSubset(
12881290 }
12891291}
12901292
1291- static TNonSymmetricTreeStructure GreedyTensorSearchLossguide (
1292- const TTrainingDataProviders& data,
1293- double modelLength,
1294- TProfileInfo& profile,
1295- TVector<TIndexType>* indices,
1296- TFold* fold,
1297- TLearnContext* ctx) {
1298-
1299- Y_ASSERT (IsSamplingPerTree (ctx->Params .ObliviousTreeOptions ));
1300-
1301- TNonSymmetricTreeStructure currentStructure;
1302-
1303- const ui32 learnSampleCount = data.Learn ->ObjectsData ->GetObjectCount ();
1304- TArrayRef<TIndexType> indicesRef (*indices);
1305-
1306- const double scoreStDev = CalcScoreStDev (learnSampleCount, modelLength, *fold, ctx);
1307-
1308- TPriorityQueue<TSplitLeafCandidate> queue;
1309- TVector<ui32> leafDepth (ctx->Params .ObliviousTreeOptions ->MaxLeaves );
1310- const auto findBestCandidate = [&](TIndexType leaf) {
1311- Y_DEFER { profile.AddOperation (TStringBuilder () << " Find best candidate for leaf " << leaf); };
1312- const auto leafBounds = ctx->SampledDocs .LeavesBounds [leaf];
1313- const bool needSplit = leafDepth[leaf] < ctx->Params .ObliviousTreeOptions ->MaxDepth
1314- && leafBounds.GetSize () >= ctx->Params .ObliviousTreeOptions ->MinDataInLeaf ;
1315- if (!needSplit) {
1316- return ;
1317- }
1318- auto candidatesContexts = SelectFeaturesForScoring (data, {}, fold, ctx);
1319- CalcBestScoreLeafwise (
1320- data,
1321- {leaf},
1322- TStatsForSubtractionTrick{},
1323- ctx->LearnProgress ->Rand .GenRand (),
1324- scoreStDev,
1325- &candidatesContexts,
1326- fold,
1327- ctx);
1328- const size_t maxFeatureValueCount = CalcMaxFeatureValueCount (*fold, candidatesContexts);
1329- CheckInterrupted (); // check after long-lasting operation
1330-
1331- double bestScore = MINIMAL_SCORE;
1332- const TCandidateInfo* bestSplitCandidate = nullptr ;
1333- const double scoreBeforeSplit = CalcScoreWithoutSplit (leaf, *fold, *ctx);
1334- SelectBestCandidate (data, *ctx, candidatesContexts, maxFeatureValueCount, *fold, scoreBeforeSplit, &bestScore, &bestSplitCandidate);
1335- fold->DropEmptyCTRs ();
1336- if (bestSplitCandidate == nullptr ) {
1337- return ;
1338- }
1339- const double gain = bestScore - scoreBeforeSplit;
1340- CATBOOST_DEBUG_LOG << " Best gain for leaf #" << leaf << " = " << gain << Endl;
1341- if (gain < 1e-9 ) {
1342- return ;
1343- }
1344- queue.emplace (leaf, gain, *bestSplitCandidate);
1345- };
1346- findBestCandidate (0 );
1347-
1348- TVector<TIndexedSubset<ui32>> subsetsForLeafs (ctx->Params .ObliviousTreeOptions ->MaxLeaves );
1349- subsetsForLeafs[0 ] = xrange (learnSampleCount).operator TIndexedSubset<ui32>();
1350-
1351- while (!queue.empty () && currentStructure.GetLeafCount () < ctx->Params .ObliviousTreeOptions ->MaxLeaves ) {
1352- /*
1353- * There is a problem with feature penalties calculation.
1354- * We consider a feature unused until we extracted a split with it from the queue.
1355- * Before that, all new splits are calculated with penalty for that feature.
1356- * And when first split by this feature is extracted from the queue, all other
1357- * splits in the queue should be recalculated without penalty for that feature.
1358- * However, we don't do it for performance and code simplicity.
1359- */
1360- const TSplitLeafCandidate curSplitLeaf = queue.top ();
1361- queue.pop ();
1362-
1363- const TSplit bestSplit = curSplitLeaf.BestCandidate .GetBestSplit (
1364- data,
1365- *fold,
1366- ctx->Params .CatFeatureParams ->OneHotMaxSize );
1367- if (bestSplit.Type == ESplitType::OnlineCtr) {
1368- ProcessCtrSplit (data, bestSplit, fold, ctx);
1369- }
1370-
1371- const TIndexType splittedNodeIdx = curSplitLeaf.Leaf ;
1372- MarkFeaturesAsUsed (
1373- bestSplit,
1374- subsetsForLeafs[splittedNodeIdx],
1375- ctx->LearnProgress ->EstimatedFeaturesContext ,
1376- *ctx->Layout ,
1377- &ctx->LearnProgress ->UsedFeatures ,
1378- &ctx->LearnProgress ->UsedFeaturesPerObject
1379- );
1380-
1381- const auto & node = currentStructure.AddSplit (bestSplit, curSplitLeaf.Leaf );
1382- const TIndexType leftChildIdx = ~node.Left ;
1383- const TIndexType rightChildIdx = ~node.Right ;
1384- UpdateIndices (
1385- node,
1386- data,
1387- subsetsForLeafs[splittedNodeIdx],
1388- *fold,
1389- ctx->LocalExecutor ,
1390- indicesRef
1391- );
1392-
1393- Y_ASSERT (leftChildIdx == splittedNodeIdx);
1394- TIndexedSubset<ui32> leftChildSubset, rightChildSubset;
1395- SplitDocsSubset (subsetsForLeafs[splittedNodeIdx], indicesRef, leftChildIdx, &leftChildSubset, &rightChildSubset);
1396- subsetsForLeafs[leftChildIdx] = std::move (leftChildSubset);
1397- subsetsForLeafs[rightChildIdx] = std::move (rightChildSubset);
1398-
1399- ctx->SampledDocs .UpdateIndicesInLeafwiseSortedFoldForSingleLeaf (
1400- splittedNodeIdx,
1401- leftChildIdx,
1402- rightChildIdx,
1403- *indices,
1404- ctx->LocalExecutor );
1405- const int newDepth = leafDepth[splittedNodeIdx] + 1 ;
1406- leafDepth[leftChildIdx] = newDepth;
1407- leafDepth[rightChildIdx] = newDepth;
1408- CATBOOST_DEBUG_LOG << " For node #" << splittedNodeIdx << " built childs: " << leftChildIdx << " and " << rightChildIdx
1409- << " with split: " << BuildDescription (*ctx->Layout , bestSplit) << " and score = " << curSplitLeaf.Gain << Endl;
1410- findBestCandidate (leftChildIdx);
1411- findBestCandidate (rightChildIdx);
1412- }
1413-
1414- return currentStructure;
1415- }
1416-
14171293namespace {
14181294 struct TSubtractTrickInfo {
14191295 const TTrainingDataProviders* Data;
@@ -1750,6 +1626,240 @@ static TNonSymmetricTreeStructure GreedyTensorSearchDepthwise(
17501626 return currentStructure;
17511627}
17521628
1629+ static TNonSymmetricTreeStructure GreedyTensorSearchLossguide (
1630+ const TTrainingDataProviders& data,
1631+ double modelLength,
1632+ TProfileInfo& profile,
1633+ TVector<TIndexType>* indices,
1634+ TFold* fold,
1635+ TLearnContext* ctx) {
1636+
1637+ Y_ASSERT (IsSamplingPerTree (ctx->Params .ObliviousTreeOptions ));
1638+
1639+ TNonSymmetricTreeStructure currentStructure;
1640+
1641+ const ui32 learnSampleCount = data.Learn ->ObjectsData ->GetObjectCount ();
1642+ TArrayRef<TIndexType> indicesRef (*indices);
1643+
1644+ const double scoreStDev = CalcScoreStDev (learnSampleCount, modelLength, *fold, ctx);
1645+
1646+ const bool isMultiClassOrMultiRegression = fold->GetApproxDimension () != 1 ;
1647+ const bool isSimpleRsm = ctx->Params .ObliviousTreeOptions ->Rsm == 1 .0f ;
1648+ const bool isSubtractTrickAllowed = !isMultiClassOrMultiRegression && isSimpleRsm;
1649+
1650+ TPriorityQueue<TSplitLeafCandidate> queue;
1651+ TVector<ui32> leafDepth (ctx->Params .ObliviousTreeOptions ->MaxLeaves );
1652+
1653+ const auto findBestCandidateRoot = [&](TIndexType leaf) {
1654+ Y_DEFER { profile.AddOperation (TStringBuilder () << " Find best candidate for leaf " << leaf); };
1655+ const auto leafBounds = ctx->SampledDocs .LeavesBounds [leaf];
1656+
1657+ const ui32 leafBoundsSize = leafBounds.GetSize ();
1658+
1659+ const bool needSplit = leafDepth[leaf] < ctx->Params .ObliviousTreeOptions ->MaxDepth
1660+ && leafBoundsSize >= ctx->Params .ObliviousTreeOptions ->MinDataInLeaf ;
1661+ if (!needSplit) {
1662+ return ;
1663+ }
1664+
1665+ auto candidatesContexts = SelectFeaturesForScoring (data, {}, fold, ctx);
1666+
1667+ TQueue<TVector<TBucketStats>> parentsQueue;
1668+
1669+ TSubtractTrickInfo subTrickInfo (
1670+ &data,
1671+ &candidatesContexts,
1672+ fold,
1673+ ctx,
1674+ &parentsQueue,
1675+ scoreStDev
1676+ );
1677+
1678+ const TCandidateInfo* leafBestSplitCandidate = nullptr ;
1679+ double leafGain = 0 ;
1680+ TSplit leafBestSplit;
1681+
1682+ TVector<TBucketStats> leafStats = CalculateStats (
1683+ subTrickInfo,
1684+ leaf,
1685+ &leafGain,
1686+ &leafBestSplitCandidate,
1687+ &leafBestSplit);
1688+
1689+ if (leafBestSplitCandidate != nullptr && leafGain >= 1e-9 ) {
1690+ queue.emplace (leaf, leafGain, *leafBestSplitCandidate, leafStats);
1691+ }
1692+ };
1693+
1694+ const auto findBestCandidate = [&](TIndexType leftLeaf, TIndexType rightLeaf, TSplitLeafCandidate parent) {
1695+ Y_DEFER { profile.AddOperation (TStringBuilder () << " Find best candidate for left leaf " << leftLeaf); };
1696+ Y_DEFER { profile.AddOperation (TStringBuilder () << " Find best candidate for right leaf " << rightLeaf); };
1697+
1698+ const auto leftLeafBounds = ctx->SampledDocs .LeavesBounds [leftLeaf];
1699+ const auto rightLeafBounds = ctx->SampledDocs .LeavesBounds [rightLeaf];
1700+
1701+ const ui32 leftLeafBoundsSize = leftLeafBounds.GetSize ();
1702+ const ui32 rightLeafBoundsSize = rightLeafBounds.GetSize ();
1703+ const bool isNextLeafConsidered = rightLeafBoundsSize >= ctx->Params .ObliviousTreeOptions ->MinDataInLeaf ;
1704+
1705+ const bool needSplitLeftLeaf = leafDepth[leftLeaf] < ctx->Params .ObliviousTreeOptions ->MaxDepth
1706+ && leftLeafBoundsSize >= ctx->Params .ObliviousTreeOptions ->MinDataInLeaf ;
1707+ const bool needSplitRightLeaf = leafDepth[rightLeaf] < ctx->Params .ObliviousTreeOptions ->MaxDepth
1708+ && rightLeafBoundsSize >= ctx->Params .ObliviousTreeOptions ->MinDataInLeaf ;
1709+ if (!needSplitLeftLeaf && !needSplitRightLeaf) {
1710+ return ;
1711+ }
1712+ auto candidatesContexts = SelectFeaturesForScoring (data, {}, fold, ctx);
1713+
1714+ TQueue<TVector<TBucketStats>> parentsQueue;
1715+ ConditionalPushToParentsQueue (parent.Gain , &parent.BestCandidate , std::move (parent.Stats ), &parentsQueue);
1716+
1717+ TSubtractTrickInfo subTrickInfo (
1718+ &data,
1719+ &candidatesContexts,
1720+ fold,
1721+ ctx,
1722+ &parentsQueue,
1723+ scoreStDev
1724+ );
1725+
1726+ const TCandidateInfo* leftLeafBestSplitCandidate = nullptr ;
1727+ double leftLeafGain = 0 ;
1728+ TSplit leftLeafBestSplit;
1729+
1730+ const TCandidateInfo* rightLeafBestSplitCandidate = nullptr ;
1731+ double rightLeafGain = 0 ;
1732+ TSplit rightLeafBestSplit;
1733+
1734+ TVector<TBucketStats> leftLeafStats;
1735+ TVector<TBucketStats> rightLeafStats;
1736+
1737+ if ((leftLeafBoundsSize <= rightLeafBoundsSize) && isSubtractTrickAllowed && needSplitLeftLeaf && needSplitRightLeaf) {
1738+ leftLeafStats = CalculateStats (
1739+ subTrickInfo,
1740+ leftLeaf,
1741+ &leftLeafGain,
1742+ &leftLeafBestSplitCandidate,
1743+ &leftLeafBestSplit);
1744+ rightLeafStats = CalculateWithSubtractTrick (
1745+ subTrickInfo,
1746+ rightLeaf,
1747+ leftLeafStats,
1748+ &rightLeafGain,
1749+ &rightLeafBestSplitCandidate,
1750+ &rightLeafBestSplit);
1751+ } else if ((leftLeafBoundsSize > rightLeafBoundsSize) && isSubtractTrickAllowed && needSplitLeftLeaf && needSplitRightLeaf) {
1752+ rightLeafStats = CalculateStats (
1753+ subTrickInfo,
1754+ rightLeaf,
1755+ &rightLeafGain,
1756+ &rightLeafBestSplitCandidate,
1757+ &rightLeafBestSplit);
1758+ leftLeafStats = CalculateWithSubtractTrick (
1759+ subTrickInfo,
1760+ leftLeaf,
1761+ rightLeafStats,
1762+ &leftLeafGain,
1763+ &leftLeafBestSplitCandidate,
1764+ &leftLeafBestSplit);
1765+ } else {
1766+ if (needSplitLeftLeaf) {
1767+ leftLeafStats = CalculateStats (
1768+ subTrickInfo,
1769+ leftLeaf,
1770+ &leftLeafGain,
1771+ &leftLeafBestSplitCandidate,
1772+ &leftLeafBestSplit);
1773+ }
1774+
1775+ if (needSplitRightLeaf) {
1776+ rightLeafStats = CalculateStats (
1777+ subTrickInfo,
1778+ rightLeaf,
1779+ &rightLeafGain,
1780+ &rightLeafBestSplitCandidate,
1781+ &rightLeafBestSplit);
1782+ }
1783+ }
1784+
1785+ if (needSplitLeftLeaf && leftLeafBestSplitCandidate != nullptr && leftLeafGain >= 1e-9 ) {
1786+ queue.emplace (leftLeaf, leftLeafGain, *leftLeafBestSplitCandidate, leftLeafStats);
1787+ }
1788+
1789+ if (needSplitRightLeaf && rightLeafBestSplitCandidate != nullptr && rightLeafGain >= 1e-9 ) {
1790+ queue.emplace (rightLeaf, rightLeafGain, *rightLeafBestSplitCandidate, rightLeafStats);
1791+ }
1792+ };
1793+ findBestCandidateRoot (0 );
1794+
1795+ TVector<TIndexedSubset<ui32>> subsetsForLeafs (ctx->Params .ObliviousTreeOptions ->MaxLeaves );
1796+ subsetsForLeafs[0 ] = xrange (learnSampleCount).operator TIndexedSubset<ui32>();
1797+
1798+ while (!queue.empty () && currentStructure.GetLeafCount () < ctx->Params .ObliviousTreeOptions ->MaxLeaves ) {
1799+ /*
1800+ * There is a problem with feature penalties calculation.
1801+ * We consider a feature unused until we extracted a split with it from the queue.
1802+ * Before that, all new splits are calculated with penalty for that feature.
1803+ * And when first split by this feature is extracted from the queue, all other
1804+ * splits in the queue should be recalculated without penalty for that feature.
1805+ * However, we don't do it for performance and code simplicity.
1806+ */
1807+ const TSplitLeafCandidate curSplitLeaf = queue.top ();
1808+ queue.pop ();
1809+
1810+ const TSplit bestSplit = curSplitLeaf.BestCandidate .GetBestSplit (
1811+ data,
1812+ *fold,
1813+ ctx->Params .CatFeatureParams ->OneHotMaxSize );
1814+ if (bestSplit.Type == ESplitType::OnlineCtr) {
1815+ ProcessCtrSplit (data, bestSplit, fold, ctx);
1816+ }
1817+
1818+ const TIndexType splittedNodeIdx = curSplitLeaf.Leaf ;
1819+ MarkFeaturesAsUsed (
1820+ bestSplit,
1821+ subsetsForLeafs[splittedNodeIdx],
1822+ ctx->LearnProgress ->EstimatedFeaturesContext ,
1823+ *ctx->Layout ,
1824+ &ctx->LearnProgress ->UsedFeatures ,
1825+ &ctx->LearnProgress ->UsedFeaturesPerObject
1826+ );
1827+
1828+ const auto & node = currentStructure.AddSplit (bestSplit, curSplitLeaf.Leaf );
1829+ const TIndexType leftChildIdx = ~node.Left ;
1830+ const TIndexType rightChildIdx = ~node.Right ;
1831+ UpdateIndices (
1832+ node,
1833+ data,
1834+ subsetsForLeafs[splittedNodeIdx],
1835+ *fold,
1836+ ctx->LocalExecutor ,
1837+ indicesRef
1838+ );
1839+
1840+ Y_ASSERT (leftChildIdx == splittedNodeIdx);
1841+ TIndexedSubset<ui32> leftChildSubset, rightChildSubset;
1842+ SplitDocsSubset (subsetsForLeafs[splittedNodeIdx], indicesRef, leftChildIdx, &leftChildSubset, &rightChildSubset);
1843+ subsetsForLeafs[leftChildIdx] = std::move (leftChildSubset);
1844+ subsetsForLeafs[rightChildIdx] = std::move (rightChildSubset);
1845+
1846+ ctx->SampledDocs .UpdateIndicesInLeafwiseSortedFoldForSingleLeaf (
1847+ splittedNodeIdx,
1848+ leftChildIdx,
1849+ rightChildIdx,
1850+ *indices,
1851+ ctx->LocalExecutor );
1852+ const int newDepth = leafDepth[splittedNodeIdx] + 1 ;
1853+ leafDepth[leftChildIdx] = newDepth;
1854+ leafDepth[rightChildIdx] = newDepth;
1855+ CATBOOST_DEBUG_LOG << " For node #" << splittedNodeIdx << " built childs: " << leftChildIdx << " and " << rightChildIdx
1856+ << " with split: " << BuildDescription (*ctx->Layout , bestSplit) << " and score = " << curSplitLeaf.Gain << Endl;
1857+ findBestCandidate (leftChildIdx, rightChildIdx, curSplitLeaf);
1858+ }
1859+
1860+ return currentStructure;
1861+ }
1862+
17531863
17541864void GreedyTensorSearch (
17551865 const TTrainingDataProviders& data,
0 commit comments