RenameDetector.java
- /*
- * Copyright (C) 2010, Google Inc. and others
- *
- * This program and the accompanying materials are made available under the
- * terms of the Eclipse Distribution License v. 1.0 which is available at
- * https://www.eclipse.org/org/documents/edl-v10.php.
- *
- * SPDX-License-Identifier: BSD-3-Clause
- */
- package org.eclipse.jgit.diff;
- import static org.eclipse.jgit.diff.DiffEntry.Side.NEW;
- import static org.eclipse.jgit.diff.DiffEntry.Side.OLD;
- import static org.eclipse.jgit.storage.pack.PackConfig.DEFAULT_BIG_FILE_THRESHOLD;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.Collection;
- import java.util.Collections;
- import java.util.Comparator;
- import java.util.HashMap;
- import java.util.List;
- import org.eclipse.jgit.api.errors.CanceledException;
- import org.eclipse.jgit.diff.DiffEntry.ChangeType;
- import org.eclipse.jgit.diff.SimilarityIndex.TableFullException;
- import org.eclipse.jgit.internal.JGitText;
- import org.eclipse.jgit.lib.AbbreviatedObjectId;
- import org.eclipse.jgit.lib.FileMode;
- import org.eclipse.jgit.lib.NullProgressMonitor;
- import org.eclipse.jgit.lib.ObjectReader;
- import org.eclipse.jgit.lib.ProgressMonitor;
- import org.eclipse.jgit.lib.Repository;
- /**
- * Detect and resolve object renames.
- */
- public class RenameDetector {
- private static final int EXACT_RENAME_SCORE = 100;
- private static final Comparator<DiffEntry> DIFF_COMPARATOR = new Comparator<>() {
- @Override
- public int compare(DiffEntry a, DiffEntry b) {
- int cmp = nameOf(a).compareTo(nameOf(b));
- if (cmp == 0)
- cmp = sortOf(a.getChangeType()) - sortOf(b.getChangeType());
- return cmp;
- }
- private String nameOf(DiffEntry ent) {
- // Sort by the new name, unless the change is a delete. On
- // deletes the new name is /dev/null, so we sort instead by
- // the old name.
- //
- if (ent.changeType == ChangeType.DELETE)
- return ent.oldPath;
- return ent.newPath;
- }
- private int sortOf(ChangeType changeType) {
- // Sort deletes before adds so that a major type change for
- // a file path (such as symlink to regular file) will first
- // remove the path, then add it back with the new type.
- //
- switch (changeType) {
- case DELETE:
- return 1;
- case ADD:
- return 2;
- default:
- return 10;
- }
- }
- };
- private List<DiffEntry> entries;
- private List<DiffEntry> deleted;
- private List<DiffEntry> added;
- private boolean done;
- private final ObjectReader objectReader;
- /** Similarity score required to pair an add/delete as a rename. */
- private int renameScore = 60;
- /**
- * Similarity score required to keep modified file pairs together. Any
- * modified file pairs with a similarity score below this will be broken
- * apart.
- */
- private int breakScore = -1;
- /** Limit in the number of files to consider for renames. */
- private int renameLimit;
- /**
- * File size threshold (in bytes) for detecting renames. Files larger
- * than this size will not be processed for renames.
- */
- private int bigFileThreshold = DEFAULT_BIG_FILE_THRESHOLD;
- /**
- * Skip detecting content renames for binary files. Content renames are
- * those that are not exact, that is with a slight content modification
- * between the two files.
- */
- private boolean skipContentRenamesForBinaryFiles = false;
- /** Set if the number of adds or deletes was over the limit. */
- private boolean overRenameLimit;
/**
 * Create a rename detector bound to a repository.
 * <p>
 * A reader is obtained from the repository and the rename options are
 * taken from the repository's {@link DiffConfig}.
 *
 * @param repo
 *            the repository to use for rename detection
 */
public RenameDetector(Repository repo) {
    this(repo.newObjectReader(), repo.getConfig().get(DiffConfig.KEY));
}
/**
 * Create a rename detector from an explicit reader and diff configuration.
 * <p>
 * The detector does not keep {@code reader} itself; it asks it for a new
 * reader and uses that one. The rename limit is initialised from
 * {@code cfg}.
 *
 * @param reader
 *            reader to obtain objects from the repository with.
 * @param cfg
 *            diff config specifying rename detection options.
 * @since 3.0
 */
public RenameDetector(ObjectReader reader, DiffConfig cfg) {
    this.objectReader = reader.newReader();
    this.renameLimit = cfg.getRenameLimit();
    reset();
}
- /**
- * Get rename score
- *
- * @return minimum score required to pair an add/delete as a rename. The
- * score ranges are within the bounds of (0, 100).
- */
- public int getRenameScore() {
- return renameScore;
- }
- /**
- * Set the minimum score required to pair an add/delete as a rename.
- * <p>
- * When comparing two files together their score must be greater than or
- * equal to the rename score for them to be considered a rename match. The
- * score is computed based on content similarity, so a score of 60 implies
- * that approximately 60% of the bytes in the files are identical.
- *
- * @param score
- * new rename score, must be within [0, 100].
- * @throws java.lang.IllegalArgumentException
- * the score was not within [0, 100].
- */
- public void setRenameScore(int score) {
- if (score < 0 || score > 100)
- throw new IllegalArgumentException(
- JGitText.get().similarityScoreMustBeWithinBounds);
- renameScore = score;
- }
- /**
- * Get break score
- *
- * @return the similarity score required to keep modified file pairs
- * together. Any modify pairs that score below this will be broken
- * apart into separate add/deletes. Values less than or equal to
- * zero indicate that no modifies will be broken apart. Values over
- * 100 cause all modify pairs to be broken.
- */
- public int getBreakScore() {
- return breakScore;
- }
- /**
- * Set break score
- *
- * @param breakScore
- * the similarity score required to keep modified file pairs
- * together. Any modify pairs that score below this will be
- * broken apart into separate add/deletes. Values less than or
- * equal to zero indicate that no modifies will be broken apart.
- * Values over 100 cause all modify pairs to be broken.
- */
- public void setBreakScore(int breakScore) {
- this.breakScore = breakScore;
- }
- /**
- * Get rename limit
- *
- * @return limit on number of paths to perform inexact rename detection
- */
- public int getRenameLimit() {
- return renameLimit;
- }
- /**
- * Set the limit on the number of files to perform inexact rename detection.
- * <p>
- * The rename detector has to build a square matrix of the rename limit on
- * each side, then perform that many file compares to determine similarity.
- * If 1000 files are added, and 1000 files are deleted, a 1000*1000 matrix
- * must be allocated, and 1,000,000 file compares may need to be performed.
- *
- * @param limit
- * new file limit. 0 means no limit; a negative number means no
- * inexact rename detection will be performed, only exact rename
- * detection.
- */
- public void setRenameLimit(int limit) {
- renameLimit = limit;
- }
- /**
- * Get file size threshold for detecting renames. Files larger
- * than this size will not be processed for rename detection.
- *
- * @return threshold in bytes of the file size.
- * @since 5.12
- */
- public int getBigFileThreshold() { return bigFileThreshold; }
- /**
- * Set the file size threshold for detecting renames. Files larger than this
- * threshold will be skipped during rename detection computation.
- *
- * @param threshold file size threshold in bytes.
- * @since 5.12
- */
- public void setBigFileThreshold(int threshold) {
- this.bigFileThreshold = threshold;
- }
- /**
- * Get skipping detecting content renames for binary files.
- *
- * @return true if content renames should be skipped for binary files, false otherwise.
- * @since 5.12
- */
- public boolean getSkipContentRenamesForBinaryFiles() {
- return skipContentRenamesForBinaryFiles;
- }
- /**
- * Sets skipping detecting content renames for binary files.
- *
- * @param value true if content renames should be skipped for binary files, false otherwise.
- * @since 5.12
- */
- public void setSkipContentRenamesForBinaryFiles(boolean value) {
- this.skipContentRenamesForBinaryFiles = value;
- }
- /**
- * Check if the detector is over the rename limit.
- * <p>
- * This method can be invoked either before or after {@code getEntries} has
- * been used to perform rename detection.
- *
- * @return true if the detector has more file additions or removals than the
- * rename limit is currently set to. In such configurations the
- * detector will skip expensive computation.
- */
- public boolean isOverRenameLimit() {
- if (done)
- return overRenameLimit;
- int cnt = Math.max(added.size(), deleted.size());
- return getRenameLimit() != 0 && getRenameLimit() < cnt;
- }
- /**
- * Add entries to be considered for rename detection.
- *
- * @param entriesToAdd
- * one or more entries to add.
- * @throws java.lang.IllegalStateException
- * if {@code getEntries} was already invoked.
- */
- public void addAll(Collection<DiffEntry> entriesToAdd) {
- if (done)
- throw new IllegalStateException(JGitText.get().renamesAlreadyFound);
- for (DiffEntry entry : entriesToAdd) {
- switch (entry.getChangeType()) {
- case ADD:
- added.add(entry);
- break;
- case DELETE:
- deleted.add(entry);
- break;
- case MODIFY:
- if (sameType(entry.getOldMode(), entry.getNewMode())) {
- entries.add(entry);
- } else {
- List<DiffEntry> tmp = DiffEntry.breakModify(entry);
- deleted.add(tmp.get(0));
- added.add(tmp.get(1));
- }
- break;
- case COPY:
- case RENAME:
- default:
- entries.add(entry);
- }
- }
- }
- /**
- * Add an entry to be considered for rename detection.
- *
- * @param entry
- * to add.
- * @throws java.lang.IllegalStateException
- * if {@code getEntries} was already invoked.
- */
- public void add(DiffEntry entry) {
- addAll(Collections.singletonList(entry));
- }
- /**
- * Detect renames in the current file set.
- * <p>
- * This convenience function runs without a progress monitor.
- * </p>
- *
- * @return an unmodifiable list of {@link org.eclipse.jgit.diff.DiffEntry}s
- * representing all files that have been changed.
- * @throws java.io.IOException
- * file contents cannot be read from the repository.
- */
- public List<DiffEntry> compute() throws IOException {
- try {
- return compute(NullProgressMonitor.INSTANCE);
- } catch (CanceledException e) {
- // Won't happen with a NullProgressMonitor
- return Collections.emptyList();
- }
- }
- /**
- * Detect renames in the current file set.
- *
- * @param pm
- * report progress during the detection phases.
- * @return an unmodifiable list of {@link org.eclipse.jgit.diff.DiffEntry}s
- * representing all files that have been changed.
- * @throws java.io.IOException
- * file contents cannot be read from the repository.
- * @throws CanceledException
- * if rename detection was cancelled
- */
- public List<DiffEntry> compute(ProgressMonitor pm)
- throws IOException, CanceledException {
- if (!done) {
- try {
- return compute(objectReader, pm);
- } finally {
- objectReader.close();
- }
- }
- return Collections.unmodifiableList(entries);
- }
- /**
- * Detect renames in the current file set.
- *
- * @param reader
- * reader to obtain objects from the repository with.
- * @param pm
- * report progress during the detection phases.
- * @return an unmodifiable list of {@link org.eclipse.jgit.diff.DiffEntry}s
- * representing all files that have been changed.
- * @throws java.io.IOException
- * file contents cannot be read from the repository.
- * @throws CanceledException
- * if rename detection was cancelled
- */
- public List<DiffEntry> compute(ObjectReader reader, ProgressMonitor pm)
- throws IOException, CanceledException {
- final ContentSource cs = ContentSource.create(reader);
- return compute(new ContentSource.Pair(cs, cs), pm);
- }
- /**
- * Detect renames in the current file set.
- *
- * @param reader
- * reader to obtain objects from the repository with.
- * @param pm
- * report progress during the detection phases.
- * @return an unmodifiable list of {@link org.eclipse.jgit.diff.DiffEntry}s
- * representing all files that have been changed.
- * @throws java.io.IOException
- * file contents cannot be read from the repository.
- * @throws CanceledException
- * if rename detection was cancelled
- */
- public List<DiffEntry> compute(ContentSource.Pair reader, ProgressMonitor pm)
- throws IOException, CanceledException {
- if (!done) {
- done = true;
- if (pm == null)
- pm = NullProgressMonitor.INSTANCE;
- if (0 < breakScore)
- breakModifies(reader, pm);
- if (!added.isEmpty() && !deleted.isEmpty())
- findExactRenames(pm);
- if (!added.isEmpty() && !deleted.isEmpty())
- findContentRenames(reader, pm);
- if (0 < breakScore && !added.isEmpty() && !deleted.isEmpty())
- rejoinModifies(pm);
- entries.addAll(added);
- added = null;
- entries.addAll(deleted);
- deleted = null;
- Collections.sort(entries, DIFF_COMPARATOR);
- }
- return Collections.unmodifiableList(entries);
- }
- /**
- * Reset this rename detector for another rename detection pass.
- */
- public void reset() {
- entries = new ArrayList<>();
- deleted = new ArrayList<>();
- added = new ArrayList<>();
- done = false;
- }
- private void advanceOrCancel(ProgressMonitor pm) throws CanceledException {
- if (pm.isCancelled()) {
- throw new CanceledException(JGitText.get().renameCancelled);
- }
- pm.update(1);
- }
- private void breakModifies(ContentSource.Pair reader, ProgressMonitor pm)
- throws IOException, CanceledException {
- ArrayList<DiffEntry> newEntries = new ArrayList<>(entries.size());
- pm.beginTask(JGitText.get().renamesBreakingModifies, entries.size());
- for (int i = 0; i < entries.size(); i++) {
- DiffEntry e = entries.get(i);
- if (e.getChangeType() == ChangeType.MODIFY) {
- int score = calculateModifyScore(reader, e);
- if (score < breakScore) {
- List<DiffEntry> tmp = DiffEntry.breakModify(e);
- DiffEntry del = tmp.get(0);
- del.score = score;
- deleted.add(del);
- added.add(tmp.get(1));
- } else {
- newEntries.add(e);
- }
- } else {
- newEntries.add(e);
- }
- advanceOrCancel(pm);
- }
- entries = newEntries;
- }
- private void rejoinModifies(ProgressMonitor pm) throws CanceledException {
- HashMap<String, DiffEntry> nameMap = new HashMap<>();
- ArrayList<DiffEntry> newAdded = new ArrayList<>(added.size());
- pm.beginTask(JGitText.get().renamesRejoiningModifies, added.size()
- + deleted.size());
- for (DiffEntry src : deleted) {
- nameMap.put(src.oldPath, src);
- advanceOrCancel(pm);
- }
- for (DiffEntry dst : added) {
- DiffEntry src = nameMap.remove(dst.newPath);
- if (src != null) {
- if (sameType(src.oldMode, dst.newMode)) {
- entries.add(DiffEntry.pair(ChangeType.MODIFY, src, dst,
- src.score));
- } else {
- nameMap.put(src.oldPath, src);
- newAdded.add(dst);
- }
- } else {
- newAdded.add(dst);
- }
- advanceOrCancel(pm);
- }
- added = newAdded;
- deleted = new ArrayList<>(nameMap.values());
- }
- private int calculateModifyScore(ContentSource.Pair reader, DiffEntry d)
- throws IOException {
- try {
- SimilarityIndex src = new SimilarityIndex();
- src.hash(reader.open(OLD, d));
- src.sort();
- SimilarityIndex dst = new SimilarityIndex();
- dst.hash(reader.open(NEW, d));
- dst.sort();
- return src.score(dst, 100);
- } catch (TableFullException tableFull) {
- // If either table overflowed while being constructed, don't allow
- // the pair to be broken. Returning 1 higher than breakScore will
- // ensure its not similar, but not quite dissimilar enough to break.
- //
- overRenameLimit = true;
- return breakScore + 1;
- }
- }
- private void findContentRenames(ContentSource.Pair reader,
- ProgressMonitor pm)
- throws IOException, CanceledException {
- int cnt = Math.max(added.size(), deleted.size());
- if (getRenameLimit() == 0 || cnt <= getRenameLimit()) {
- SimilarityRenameDetector d;
- d = new SimilarityRenameDetector(reader, deleted, added);
- d.setRenameScore(getRenameScore());
- d.setBigFileThreshold(getBigFileThreshold());
- d.setSkipBinaryFiles(getSkipContentRenamesForBinaryFiles());
- d.compute(pm);
- overRenameLimit |= d.isTableOverflow();
- deleted = d.getLeftOverSources();
- added = d.getLeftOverDestinations();
- entries.addAll(d.getMatches());
- } else {
- overRenameLimit = true;
- }
- }
- @SuppressWarnings("unchecked")
- private void findExactRenames(ProgressMonitor pm)
- throws CanceledException {
- pm.beginTask(JGitText.get().renamesFindingExact, //
- added.size() + added.size() + deleted.size()
- + added.size() * deleted.size());
- HashMap<AbbreviatedObjectId, Object> deletedMap = populateMap(deleted, pm);
- HashMap<AbbreviatedObjectId, Object> addedMap = populateMap(added, pm);
- ArrayList<DiffEntry> uniqueAdds = new ArrayList<>(added.size());
- ArrayList<List<DiffEntry>> nonUniqueAdds = new ArrayList<>();
- for (Object o : addedMap.values()) {
- if (o instanceof DiffEntry)
- uniqueAdds.add((DiffEntry) o);
- else
- nonUniqueAdds.add((List<DiffEntry>) o);
- }
- ArrayList<DiffEntry> left = new ArrayList<>(added.size());
- for (DiffEntry a : uniqueAdds) {
- Object del = deletedMap.get(a.newId);
- if (del instanceof DiffEntry) {
- // We have one add to one delete: pair them if they are the same
- // type
- DiffEntry e = (DiffEntry) del;
- if (sameType(e.oldMode, a.newMode)) {
- e.changeType = ChangeType.RENAME;
- entries.add(exactRename(e, a));
- } else {
- left.add(a);
- }
- } else if (del != null) {
- // We have one add to many deletes: find the delete with the
- // same type and closest name to the add, then pair them
- List<DiffEntry> list = (List<DiffEntry>) del;
- DiffEntry best = bestPathMatch(a, list);
- if (best != null) {
- best.changeType = ChangeType.RENAME;
- entries.add(exactRename(best, a));
- } else {
- left.add(a);
- }
- } else {
- left.add(a);
- }
- advanceOrCancel(pm);
- }
- for (List<DiffEntry> adds : nonUniqueAdds) {
- Object o = deletedMap.get(adds.get(0).newId);
- if (o instanceof DiffEntry) {
- // We have many adds to one delete: find the add with the same
- // type and closest name to the delete, then pair them. Mark the
- // rest as copies of the delete.
- DiffEntry d = (DiffEntry) o;
- DiffEntry best = bestPathMatch(d, adds);
- if (best != null) {
- d.changeType = ChangeType.RENAME;
- entries.add(exactRename(d, best));
- for (DiffEntry a : adds) {
- if (a != best) {
- if (sameType(d.oldMode, a.newMode)) {
- entries.add(exactCopy(d, a));
- } else {
- left.add(a);
- }
- }
- }
- } else {
- left.addAll(adds);
- }
- } else if (o != null) {
- // We have many adds to many deletes: score all the adds against
- // all the deletes by path name, take the best matches, pair
- // them as renames, then call the rest copies
- List<DiffEntry> dels = (List<DiffEntry>) o;
- long[] matrix = new long[dels.size() * adds.size()];
- int mNext = 0;
- for (int delIdx = 0; delIdx < dels.size(); delIdx++) {
- String deletedName = dels.get(delIdx).oldPath;
- for (int addIdx = 0; addIdx < adds.size(); addIdx++) {
- String addedName = adds.get(addIdx).newPath;
- int score = SimilarityRenameDetector.nameScore(addedName, deletedName);
- matrix[mNext] = SimilarityRenameDetector.encode(score, delIdx, addIdx);
- mNext++;
- if (pm.isCancelled()) {
- throw new CanceledException(
- JGitText.get().renameCancelled);
- }
- }
- }
- Arrays.sort(matrix);
- for (--mNext; mNext >= 0; mNext--) {
- long ent = matrix[mNext];
- int delIdx = SimilarityRenameDetector.srcFile(ent);
- int addIdx = SimilarityRenameDetector.dstFile(ent);
- DiffEntry d = dels.get(delIdx);
- DiffEntry a = adds.get(addIdx);
- if (a == null) {
- advanceOrCancel(pm);
- continue; // was already matched earlier
- }
- ChangeType type;
- if (d.changeType == ChangeType.DELETE) {
- // First use of this source file. Tag it as a rename so we
- // later know it is already been used as a rename, other
- // matches (if any) will claim themselves as copies instead.
- //
- d.changeType = ChangeType.RENAME;
- type = ChangeType.RENAME;
- } else {
- type = ChangeType.COPY;
- }
- entries.add(DiffEntry.pair(type, d, a, 100));
- adds.set(addIdx, null); // Claim the destination was matched.
- advanceOrCancel(pm);
- }
- } else {
- left.addAll(adds);
- }
- advanceOrCancel(pm);
- }
- added = left;
- deleted = new ArrayList<>(deletedMap.size());
- for (Object o : deletedMap.values()) {
- if (o instanceof DiffEntry) {
- DiffEntry e = (DiffEntry) o;
- if (e.changeType == ChangeType.DELETE)
- deleted.add(e);
- } else {
- List<DiffEntry> list = (List<DiffEntry>) o;
- for (DiffEntry e : list) {
- if (e.changeType == ChangeType.DELETE)
- deleted.add(e);
- }
- }
- }
- pm.endTask();
- }
- /**
- * Find the best match by file path for a given DiffEntry from a list of
- * DiffEntrys. The returned DiffEntry will be of the same type as <src>. If
- * no DiffEntry can be found that has the same type, this method will return
- * null.
- *
- * @param src
- * the DiffEntry to try to find a match for
- * @param list
- * a list of DiffEntrys to search through
- * @return the DiffEntry from <list> who's file path best matches <src>
- */
- private static DiffEntry bestPathMatch(DiffEntry src, List<DiffEntry> list) {
- DiffEntry best = null;
- int score = -1;
- for (DiffEntry d : list) {
- if (sameType(mode(d), mode(src))) {
- int tmp = SimilarityRenameDetector
- .nameScore(path(d), path(src));
- if (tmp > score) {
- best = d;
- score = tmp;
- }
- }
- }
- return best;
- }
- @SuppressWarnings("unchecked")
- private HashMap<AbbreviatedObjectId, Object> populateMap(
- List<DiffEntry> diffEntries, ProgressMonitor pm)
- throws CanceledException {
- HashMap<AbbreviatedObjectId, Object> map = new HashMap<>();
- for (DiffEntry de : diffEntries) {
- Object old = map.put(id(de), de);
- if (old instanceof DiffEntry) {
- ArrayList<DiffEntry> list = new ArrayList<>(2);
- list.add((DiffEntry) old);
- list.add(de);
- map.put(id(de), list);
- } else if (old != null) {
- // Must be a list of DiffEntries
- ((List<DiffEntry>) old).add(de);
- map.put(id(de), old);
- }
- advanceOrCancel(pm);
- }
- return map;
- }
- private static String path(DiffEntry de) {
- return de.changeType == ChangeType.DELETE ? de.oldPath : de.newPath;
- }
- private static FileMode mode(DiffEntry de) {
- return de.changeType == ChangeType.DELETE ? de.oldMode : de.newMode;
- }
- private static AbbreviatedObjectId id(DiffEntry de) {
- return de.changeType == ChangeType.DELETE ? de.oldId : de.newId;
- }
- static boolean sameType(FileMode a, FileMode b) {
- // Files have to be of the same type in order to rename them.
- // We would never want to rename a file to a gitlink, or a
- // symlink to a file.
- //
- int aType = a.getBits() & FileMode.TYPE_MASK;
- int bType = b.getBits() & FileMode.TYPE_MASK;
- return aType == bType;
- }
- private static DiffEntry exactRename(DiffEntry src, DiffEntry dst) {
- return DiffEntry.pair(ChangeType.RENAME, src, dst, EXACT_RENAME_SCORE);
- }
- private static DiffEntry exactCopy(DiffEntry src, DiffEntry dst) {
- return DiffEntry.pair(ChangeType.COPY, src, dst, EXACT_RENAME_SCORE);
- }
- }