scancode-toolkit/src/licensedcode/detection.py at develop · aboutcode-org/scancode-toolkit

History

2244 lines (1845 loc) · 79.4 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

# ScanCode is a trademark of nexB Inc.

# SPDX-License-Identifier: Apache-2.0

# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.

# See https://github.com/nexB/scancode-toolkit for support or download.

# See https://aboutcode.org for more information about nexB OSS projects.

import posixpath

import sys

import os

import logging

import typing

import uuid

from enum import Enum

from hashlib import sha1

import attr

from collections import defaultdict

from license_expression import combine_expressions

from license_expression import Licensing

from commoncode.resource import clean_path

from commoncode.text import python_safe_name

from commoncode.fileutils import as_posixpath

from licensedcode.cache import build_spdx_license_expression

from licensedcode.cache import get_cache

from licensedcode.cache import get_index

from licensedcode.cache import get_licensing

from licensedcode.match import LicenseMatch

from licensedcode.match import set_matched_lines

from licensedcode.models import compute_relevance

from licensedcode.models import Rule

from licensedcode.models import UnDetectedRule

from licensedcode.query import LINES_THRESHOLD

from licensedcode.query import Query

from licensedcode.spans import Span

from licensedcode.tokenize import query_tokenizer

from summarycode.classify import check_is_path_community_file

"""

LicenseDetection data structure and processing.

A LicenseDetection combines one or more matches together using various rules and

heuristics.

"""

TRACE = os.environ.get('SCANCODE_DEBUG_LICENSE_DETECTION', False)

TRACE_REFERENCE = os.environ.get('SCANCODE_DEBUG_PLUGIN_LICENSE_REFERENCE', False)

TRACE_ANALYSIS = False

TRACE_IS_FUNCTIONS = False

def logger_debug(*args):

pass

logger = logging.getLogger(__name__)

if TRACE:

if (

TRACE

or TRACE_ANALYSIS

or TRACE_IS_FUNCTIONS

logging.basicConfig(stream=sys.stdout)

logger.setLevel(logging.DEBUG)

def logger_debug(*args):

return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))

MATCHER_UNDETECTED = '5-undetected'

MATCHER_UNDETECTED_ORDER = 4

# All values of match_coverage less than this value then they are not considered

# as perfect detections

IMPERFECT_MATCH_COVERAGE_THR = 100

# Values of match_coverage less than this are reported as `license_clues` matches

CLUES_MATCH_COVERAGE_THR = 60

# Low Relevance threshold

LOW_RELEVANCE_THRESHOLD = 70

# False positives to spurious and gibberish texts are found usually later in the file

# and matched to relatively short rules

# Threshold Value of start line after which a match to likely be a false positive

FALSE_POSITIVE_START_LINE_THRESHOLD = 1000

# Threshold Value of rule length below which a match to likely be a false positive

FALSE_POSITIVE_RULE_LENGTH_THRESHOLD = 3

class DetectionCategory(Enum):

"""

These are the primary types of Detections which a group of license

matches are classified into.

"""

PERFECT_DETECTION = 'perfect-detection'

UNKNOWN_INTRO_BEFORE_DETECTION = 'unknown-intro-before-detection'

UNKNOWN_FILE_REFERENCE_LOCAL = 'unknown-file-reference-local'

UNKNOWN_REFERENCE_IN_FILE_TO_PACKAGE = 'unknown-reference-in-file-to-package'

UNKNOWN_REFERENCE_IN_FILE_TO_NONEXISTENT_PACKAGE = 'unknown-reference-in-file-to-nonexistent-package'

PACKAGE_UNKNOWN_FILE_REFERENCE_LOCAL = 'package-unknown-file-reference-local'

PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'

PACKAGE_ADD_FROM_FILE = 'from-package-file'

EXTRA_WORDS = 'extra-words'

UNKNOWN_MATCH = 'unknown-match'

LICENSE_CLUES = 'license-clues'

LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'

IMPERFECT_COVERAGE = 'imperfect-match-coverage'

FALSE_POSITVE = 'possible-false-positive'

UNDETECTED_LICENSE = 'undetected-license'

LOW_RELEVANCE = 'low-relevance'

class DetectionRule(Enum):

"""

These are secondary types of Detections/Heuristics which are applied to the

group of LicenseMatch objects to create a LicenseDetection object and it's

effective `license_expression`.

These are logged in LicenseDetection.detection_log for verbosity.

"""

UNKNOWN_MATCH = 'unknown-match'

EXTRA_WORDS = 'extra-words'

LICENSE_CLUES = 'license-clues'

LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'

IMPERFECT_COVERAGE = 'imperfect-match-coverage'

FALSE_POSITIVE = 'possible-false-positive'

NOT_LICENSE_CLUES = 'not-license-clues-as-more-detections-present'

UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'unknown-reference-to-local-file'

UNKNOWN_INTRO_FOLLOWED_BY_MATCH = 'unknown-intro-followed-by-match'

UNKNOWN_REFERENCE_IN_FILE_TO_PACKAGE = 'unknown-reference-in-file-to-package'

UNKNOWN_REFERENCE_IN_FILE_TO_NONEXISTENT_PACKAGE = 'unknown-reference-in-file-to-nonexistent-package'

CONTAINED_SAME_LICENSE = 'contained-with-same-license'

UNVERSIONED_FOLLOWED_BY_VERSIONED = 'un-versioned-followed-by-versioned'

UNDETECTED_LICENSE = 'undetected-license'

PACKAGE_UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'package-unknown-reference-to-local-file'

PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'

PACKAGE_ADD_FROM_FILE = 'from-package-file'

@attr.s

class FileRegion:

"""

A file has one or more file-regions, which are separate regions of the file

containing some license information (separated by code/text/others in between),

and identified by a start line and an end line.

"""

path = attr.ib(type=str)

start_line = attr.ib(type=int)

end_line = attr.ib(type=int)

def to_dict(self):

return attr.asdict(self, dict_factory=dict)

@attr.s(slots=True, eq=False, order=False)

class LicenseDetection:

"""

A LicenseDetection combines one or more LicenseMatch using multiple rules

and heuristics. For instance, a "license intro" match followed by a proper

match may be combined in a single detection for the matched license

expression.

"""

license_expression = attr.ib(

default=None,

metadata=dict(

help='A license expression string using the SPDX license expression'

' syntax and ScanCode license keys, the effective license expression'

' for this license detection.')

)

license_expression_spdx = attr.ib(

default=None,

metadata=dict(

help='SPDX license expression string with SPDX ids.')

)

matches = attr.ib(

default=attr.Factory(list),

metadata=dict(

help='List of license matches combined in this detection.'

)

detection_log = attr.ib(

repr=False,

default=attr.Factory(list),

metadata=dict(

help='A list of detection DetectionRule explaining how '

'this detection was created.'

)

identifier = attr.ib(

default=None,

metadata=dict(

help='An identifier unique for a license detection, containing the license '

'expression and a UUID crafted from the match contents.')

)

# Only used in unique detection calculation and referencing

file_region = attr.ib(

default=attr.Factory(dict),

metadata=dict(

help='File path and start end lines to locate the detection.'

)

@classmethod

def from_matches(

cls,

matches,

analysis=None,

post_scan=False,

package_license=False,

"""

Return a LicenseDetection created out of `matches` list of

LicenseMatch objects.

If `analysis` is , `matches` are not analyzed again for

license_expression creation.

If `post_scan` is True, this function is called outside

the main license detection step.

"""

if TRACE:

logger_debug(f"LicenseDetection: from_matches: matches: {matches}")

if not matches:

return

if analysis is None:

analysis = analyze_detection(

license_matches=matches,

package_license=package_license

)

detection_log, license_expression = get_detected_license_expression(

analysis=analysis,

license_matches=matches,

post_scan=post_scan,

)

if license_expression == None:

return cls(

matches=matches,

detection_log=detection_log,

)

detection = cls(

matches=matches,

license_expression=str(license_expression),

detection_log=detection_log,

)

detection.identifier = detection.identifier_with_expression

detection.license_expression_spdx = detection.spdx_license_expression()

return detection

def spdx_license_expression(self):

return str(build_spdx_license_expression(

license_expression=self.license_expression,

licensing=get_licensing(),

))

def __eq__(self, other):

return (

isinstance(other, LicenseDetection)

and self.matches == other.matches

)

@property

def query(self):

# A LicenseDetection will always be created with matches

assert self.matches

# All the matches in a file or in a LicenseDetection point to the

# same query

return self.matches[0].query

@property

def qspans(self):

return [match.qspan for match in self.matches]

def get_file_region(self, path):

"""

This is an identifier for a license detection, based on it's underlying

license matches.

"""

start_line, end_line = self.get_start_end_line()

return FileRegion(

path=path,

start_line=start_line,

end_line=end_line,

)

@property

def _identifier(self):

"""

Return an unique identifier for a license detection, based on it's

underlying license matches with the tokenized matched_text.

"""

data = []

for match in self.matches:

matched_text = match.matched_text

if isinstance(matched_text, typing.Callable):

matched_text = matched_text()

if matched_text is None:

matched_text = ''

if not isinstance(matched_text, str):

matched_text = repr(matched_text)

tokenized_matched_text = tuple(query_tokenizer(matched_text))

identifier = (

match.rule.identifier,

match.score(),

tokenized_matched_text,

)

data.append(identifier)

# Return a uuid generated from the contents of the matches

return get_uuid_on_content(content=data)

@property

def identifier_with_expression(self):

"""

Return an identifer for a license detection with the license expression

and an UUID created from the detection contents.

"""

id_safe_expression = python_safe_name(s=str(self.license_expression))

return "{}-{}".format(id_safe_expression, self._identifier)

@property

def is_unknown(self):

"""

Return True if there are unknown license keys in the license expression

for this detection, return False otherwise. By design these are licenses with "unknown" in

their key.

"""

return 'unknown' in self.license_expression

def get_start_end_line(self):

"""

Return start and end line for a license detection issue, from the

license match(es).

"""

if isinstance(self.matches[0], dict):

start_line = min([match['start_line'] for match in self.matches])

end_line = max([match['end_line'] for match in self.matches])

else:

start_line = min([match.start_line for match in self.matches])

end_line = max([match.end_line for match in self.matches])

return start_line, end_line

def rules_length(self):

"""

Return the length of the combined matched rules as the number

of all rule tokens.

Because of the possible overlap this may be inaccurate.

"""

return sum(m.self.rule.length for m in self.matches)

def coverage(self):

"""

Return the score for this detection as a rounded float between 0 and 100.

This is an indication of the how much this detection covers the rules of

the underlying match.

This is computed as the sum of the underlying matches coverage weighted

by the length of a match to the overall detection length.

"""

length = self.length

weighted_coverages = (m.coverage() * (m.len() / length) for m in self.matches)

return min([round(sum(weighted_coverages), 2), 100])

def relevance(self):

"""

Return the ``relevance`` of this detection. The relevance

is a float between 0 and 100 where 100 means highly relevant and 0 means

not relevant at all.

This is computed as the relevance of the sum of the underlying matches

rule length.

"""

return compute_relevance(self.rules_length())

def score(self):

"""

Return the score for this detection as a rounded float between 0 and 100.

The score is an indication of the confidence of the detection.

This is computed as the sum of the underlying matches score weighted

by the length of a match to the overall detection length.

"""

length = self.length

weighted_scores = (m.score() * (m.len() / length) for m in self.matches)

return min([round(sum(weighted_scores), 2), 100])

def append(

self,

match,

reason=None,

combine_license=False,

override_license=False,

"""

Append the ``match`` LicenseMatch to this detection and update it

accordingly. Append the ``reason`` to the detection_log.

If ``combine_license`` is True the license_expression of the ``match``

is combined with the detection license_expression. Do not combine

otherwise.

If ``override_license`` is True, the license_expression of the ``match``

replaces the the detection license_expression. Do not override license

otherwise.

``combine_license`` and ``override_license`` are ignored for the first

match appended to this detection: license is taken as is in this case.

"""

if not isinstance(match, LicenseMatch):

raise TypeError(f'Not a LicenseMatch: {match!r}')

assert not (combine_license and override_license), (

'combine_license and override_license are mutually exclusive'

)

if not self.matches:

# first match is always an ovveride

combine_license = False

override_license = True

self.matches.append(match)

self.length += match.length

if reason:

self.detection_log.append(reason)

licensing = get_licensing()

if combine_license:

license_expression = combine_expressions(

[self.license_expression, match.license_expression],

unique=True,

licensing=licensing,

)

self.license_expression = str(license_expression)

elif override_license:

# Use the match expression

license_expression = licensing.parse(match.license_expression)

self.license_expression = str(license_expression)

def percentage_license_text_of_file(self, qspans):

"""

Return the percentage of license text in the file where the

license was detected, from a list of `qspans`.

Here qspans is a list of all individual qspans corresponding

to the LicenseDetections for the file.

"""

matched_tokens_length = len(Span().union(*qspans))

query_tokens_length = self.query.tokens_length(with_unknown=True)

return round((matched_tokens_length / query_tokens_length) * 100, 2)

def to_dict(

self,

include_text=False,

license_text_diagnostics=False,

license_diagnostics=False,

whole_lines=True,

"""

Return a mapping for LicenseDetection objects.

"""

def dict_fields(attr, value):

if attr.name == 'file_region':

return False

if attr.name == 'detection_log' and not license_diagnostics:

return False

return True

data_matches = []

for match in self.matches:

data_matches.append(

match.to_dict(

include_text=include_text,

license_text_diagnostics=license_text_diagnostics,

whole_lines=whole_lines,

)

detection = attr.asdict(self, filter=dict_fields, dict_factory=dict)

detection["matches"] = data_matches

return detection

def get_uuid_on_content(content):

"""

Return an UUID based on the contents of a list, which should be

a list of hashable elements.

"""

identifier_string = repr(tuple(content))

md_hash = sha1(identifier_string.encode('utf-8'))

return str(uuid.UUID(hex=md_hash.hexdigest()[:32]))

@attr.s

class LicenseDetectionFromResult(LicenseDetection):

"""

A LicenseDetection object that is created and rehydrated from a

LicenseDetection mapping. The LicenseMatch objects in the

`matches` will be LicenseMatchFromResult objects too, as these are

created from data mappings and don't have the input text/spans

available.

"""

@classmethod

def from_license_detection_mapping(

cls,

license_detection_mapping,

file_path,

"""

Return a LicenseDetectionFromResult objects created from a LicenseDetection

mapping `license_detection_mapping`.

"""

matches = LicenseMatchFromResult.from_dicts(

license_match_mappings=license_detection_mapping["matches"]

)

detection = cls(

license_expression=license_detection_mapping["license_expression"],

license_expression_spdx=license_detection_mapping["license_expression_spdx"],

detection_log=license_detection_mapping.get("detection_log", []) or None,

identifier=license_detection_mapping["identifier"],

matches=matches,

file_region=None,

)

detection.file_region = detection.get_file_region(path=file_path)

return detection

def detections_from_license_detection_mappings(

license_detection_mappings,

file_path,

"""

Return a list of LicenseDetectionFromResult objects created from a

list of LicenseDetection mappings: `license_detection_mappings`.

"""

license_detections = []

for license_detection_mapping in license_detection_mappings:

license_detections.append(

LicenseDetectionFromResult.from_license_detection_mapping(

license_detection_mapping=license_detection_mapping,

file_path=file_path,

)

return license_detections

def get_new_identifier_from_detections(initial_detection, detections_added, license_expression):

"""

Return a new UUID based on two sets of detections: `initial_detection` is

the detection being modified with a list of detections (from another file region)

`detections_added`.

"""

identifiers = [

detection_mapping["identifier"]

for detection_mapping in detections_added

]

identifiers.append(initial_detection["identifier"])

uuid = get_uuid_on_content(content=sorted(identifiers))

expression = python_safe_name(s=str(license_expression))

return f"{expression}-{uuid}"

@attr.s

class LicenseMatchFromResult(LicenseMatch):

"""

A LicenseMatch object recreated from a LicenseMatch data mapping.

"""

match_score = attr.ib(

default=None,

metadata=dict(

help='License Detection Score')

)

matched_length = attr.ib(

default=None,

metadata=dict(

help='License match length')

)

match_coverage = attr.ib(

default=None,

metadata=dict(

help='License match coverage')

)

text = attr.ib(

default=None,

metadata=dict(

help='Text which was matched')

)

matched_text_diagnostics = attr.ib(

default=None,

metadata=dict(

help='Text which was matched, with extra diagnostics information.')

)

def score(self):

return self.match_score

def len(self):

return self.matched_length

def coverage(self):

return self.match_coverage

def matched_text(self, whole_lines=False, highlight=True):

return self.text

@property

def identifier(self):

return self.rule.identifier

@classmethod

def from_dict(cls, license_match_mapping):

"""

Return a LicenseMatchFromResult object from a ``license_match_mapping``

LicenseMatch data mappping.

"""

rule = Rule.from_match_data(license_match_mapping)

matched_text = license_match_mapping.get("matched_text") or None

matched_text_diagnostics = license_match_mapping.get("matched_text_diagnostics") or None

return cls(

from_file=license_match_mapping["from_file"],

start_line=license_match_mapping["start_line"],

end_line=license_match_mapping["end_line"],

match_score=license_match_mapping["score"],

matched_length=license_match_mapping["matched_length"],

match_coverage=license_match_mapping["match_coverage"],

matcher=license_match_mapping["matcher"],

text=matched_text,

matched_text_diagnostics=matched_text_diagnostics,

rule=rule,

qspan=None,

ispan=None,

)

@classmethod

def from_dicts(cls, license_match_mappings):

"""

Return a LicenseMatchFromResult object from a ``license_match_mapping`s`

list of LicenseMatch data mapppings.

"""

return [LicenseMatchFromResult.from_dict(lmm) for lmm in license_match_mappings]

def to_dict(

self,

include_text=False,

license_text_diagnostics=False,

whole_lines=True,

rule_details=False,

"""

Return a "result" scan data built from a LicenseMatch object.

"""

result = {}

result['license_expression'] = self.rule.license_expression

result['license_expression_spdx'] = self.rule.spdx_license_expression()

result['from_file'] = self.from_file

result['start_line'] = self.start_line

result['end_line'] = self.end_line

if rule_details:

result.update(self.rule.get_flags_mapping())

result['matcher'] = self.matcher

result['score'] = self.score()

result['matched_length'] = self.len()

if rule_details:

result["rule_length"] = self.rule.length

result['match_coverage'] = self.coverage()

result['rule_relevance'] = self.rule.relevance

result['rule_identifier'] = self.rule.identifier

result['rule_url'] = self.rule.rule_url

if rule_details:

result["rule_notes"] = self.rule.notes

result["referenced_filenames"] = self.rule.referenced_filenames

if include_text and self.text:

result['matched_text'] = self.text

if license_text_diagnostics and self.matched_text_diagnostics:

result['matched_text_diagnostics'] = self.matched_text_diagnostics

if rule_details:

result["rule_text"] = self.rule.text

return result

def populate_matches_with_path(matches, path):

"""

Given `matches` list of LicenseMatch objects, populate the `from_file`

attribute in them with `path` which is the path for the origin file for

that license match.

"""

for match in matches:

# Here if we have the `from_file` attribute populated already,

# they are from other files, and if it's empty, they are from

# the original resource, so we populate the files with the resource

# path for the original resource of their origin

if not match["from_file"]:

match["from_file"] = path

def collect_license_detections(codebase, include_license_clues=True):

"""

Return a list of LicenseDetectionFromResult object rehydrated from

LicenseDetection mappings, from resources and packages in a ``codebase``.

As a side effect, this also corrects `declared_license_expression` in packages

according to their license detections. This is required because package fields

are populated in package plugin, which runs before the license plugin, and thus

the license plugin step where unknown references to other files are dereferenced

does not show up automatically in package attributes.

Also populate from_file attributes with resource paths for matches which have

origin in the same file.

"""

has_packages = hasattr(codebase.root, 'package_data')

has_licenses = hasattr(codebase.root, 'license_detections')

all_license_detections = []

for resource in codebase.walk():

resource_license_detections = []

if has_licenses:

license_detections = getattr(resource, 'license_detections', []) or []

for detection in license_detections:

populate_matches_with_path(matches=detection["matches"], path=resource.path)

license_clues = getattr(resource, 'license_clues', []) or []

populate_matches_with_path(matches=license_clues, path=resource.path)

codebase.save_resource(resource)

if license_detections:

license_detection_objects = detections_from_license_detection_mappings(

license_detection_mappings=license_detections,

file_path=resource.path,

)

resource_license_detections.extend(license_detection_objects)

if include_license_clues and license_clues:

license_matches = LicenseMatchFromResult.from_dicts(

license_match_mappings=license_clues,

)

for group_of_matches in group_matches(license_matches=license_matches):

detection = LicenseDetection.from_matches(matches=group_of_matches)

detection.file_region = detection.get_file_region(path=resource.path)

resource_license_detections.append(detection)

all_license_detections.extend(resource_license_detections)

if TRACE:

logger_debug(

f'license detections collected at path {resource.path}:',

f'resource_license_detections: {resource_license_detections}\n',

f'all_license_detections: {all_license_detections}',

)

if has_packages:

package_data = getattr(resource, 'package_data', []) or []

package_license_detection_mappings = []

modified = False

for package in package_data:

package_license_detections = package["license_detections"]

if package_license_detections:

for detection in package_license_detections:

populate_matches_with_path(matches=detection["matches"], path=resource.path)

modified = True

package_license_detection_mappings.extend(package_license_detections)

detection_is_same, license_expression = verify_package_license_expression(

license_detection_mappings=package_license_detections,

license_expression=package["declared_license_expression"]

)

if not detection_is_same:

package["declared_license_expression"] = license_expression

package["declared_license_expression_spdx"] = str(build_spdx_license_expression(

license_expression=license_expression,

licensing=get_licensing(),

))

modified = True

other_license_detections = package["other_license_detections"]

if other_license_detections:

package_license_detection_mappings.extend(other_license_detections)

detection_is_same, license_expression = verify_package_license_expression(

license_detection_mappings=other_license_detections,

license_expression=package["other_license_expression"]

)

if not detection_is_same:

package["other_license_expression"] = license_expression

package["other_license_expression_spdx"] = str(build_spdx_license_expression(

license_expression=license_expression,

licensing=get_licensing(),

))

modified = True

if modified:

codebase.save_resource(resource)

if package_license_detection_mappings:

package_license_detection_objects = detections_from_license_detection_mappings(

license_detection_mappings=package_license_detection_mappings,

file_path=resource.path,

)

all_license_detections.extend(package_license_detection_objects)

if has_packages and has_licenses:

for package in getattr(codebase.attributes, 'packages', []):

license_expression_package = package["declared_license_expression"]

if not license_expression_package:

continue

resource_paths = package["datafile_paths"]

if len(resource_paths) == 1:

resource_path = resource_paths[0]

else:

# TODO: implement the correct consistency check

# based on which datafile path the license came from

resource_path = resource_paths[0]

resource = codebase.get_resource(path=resource_path)

resource_packages = getattr(resource, 'package_data', None)

if not resource_packages or len(resource_packages) > 1:

continue

resource_package = resource_packages[0]

if license_expression_package != resource_package["declared_license_expression"]:

package["license_detections"] = resource_package["license_detections"]

package["declared_license_expression"] = resource_package["declared_license_expression"]

package["declared_license_expression_spdx"] = resource_package["declared_license_expression_spdx"]

return all_license_detections

def verify_package_license_expression(license_detection_mappings, license_expression):

"""

Returns a tuple of two files: `detection_is_same` and `license_expression` depending

on whether the `license_expression` is same as the license_expression computed from

`license_detection_mappings`:

1. If they are the same, we return True and None for the `license_expression`

2. If they are not the same, we return False, and the computed `license_expression`

"""

license_expressions_from_detections = [

detection["license_expression"]

for detection in license_detection_mappings

]

license_expression_from_detections = str(combine_expressions(

expressions=license_expressions_from_detections,

relation='AND',

unique=True,

licensing=get_licensing(),

))

if not license_expression_from_detections == license_expression:

return False, license_expression_from_detections

else:

return True, None

@attr.s

class UniqueDetection:

"""

An unique License Detection.

"""

identifier = attr.ib(default=None)

license_expression = attr.ib(default=None)

license_expression_spdx = attr.ib(default=None)

detection_count = attr.ib(default=None)

matches = attr.ib(default=attr.Factory(list))

detection_log = attr.ib(default=attr.Factory(list))

file_regions = attr.ib(factory=list)

@property

def is_unknown(self):

"""

Return True if there are unknown license keys in the license expression

for this detection, return False otherwise. By design these are licenses with "unknown" in

their key.

"""

return 'unknown' in self.license_expression

@classmethod

def get_unique_detections(cls, license_detections):

"""

Return all unique UniqueDetection from a ``license_detections`` list of

LicenseDetection.

"""

licensing = get_licensing()

detections_by_id = get_detections_by_id(license_detections)

unique_license_detections = []

for all_detections in detections_by_id.values():

file_regions = [

detection.file_region

for detection in all_detections

]

detection = next(iter(all_detections))

detection_log = []

if hasattr(detection, "detection_log"):

if detection.detection_log:

detection_log.extend(detection.detection_log)

if not detection.license_expression:

detection.license_expression = str(combine_expressions(

expressions=[

match.rule.license_expression

for match in detection.matches

licensing=licensing ,

))

detection.license_expression_spdx = detection.spdx_license_expression()

detection.identifier = detection.identifier_with_expression

unique_license_detections.append(

cls(

identifier=detection.identifier,

license_expression=detection.license_expression,

license_expression_spdx=detection.license_expression_spdx,

detection_log=detection_log or [],

matches=detection.matches,

detection_count=len(file_regions),

file_regions=file_regions,

)

return unique_license_detections

def to_dict(self,

include_text=False,

license_text_diagnostics=False,

license_diagnostics=False,

def dict_fields(attr, value):

if attr.name == 'file_regions':

return False

if attr.name == 'matches':

return False

if attr.name == 'detection_log' and not license_diagnostics:

return False

return True

detection_mapping = attr.asdict(self, filter=dict_fields)

detection_mapping["reference_matches"] = [

match.to_dict(

include_text=include_text,

license_text_diagnostics=license_text_diagnostics,

)

for match in self.matches

]

return detection_mapping

def get_license_detection_object(self):

return LicenseDetection(

license_expression=self.license_expression,

license_expression_spdx=self.license_expression_spdx,

detection_log=self.detection_log,

matches=self.matches,

identifier=self.identifier,

file_region=None,

)

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

FilesExpand file tree

detection.py

Latest commit

History

detection.py

File metadata and controls