Merge different languages for language stats (#24900)
Fix #24896. If users set different languages via `linguist-language`, the `stats` map could contain entries that differ only by case, e.g. `java: 100, Java: 200`. Language stats are stored case-insensitively in the database under a unique key, so differently-cased language names must be merged into one unique name: `Java: 300`.
This commit is contained in:
		
							parent
							
								
									63d5e762d8
								
							
						
					
					
						commit
						395bb33e4c
					
				
					 5 changed files with 59 additions and 6 deletions
				
			
		| 
						 | 
				
			
			@ -3,7 +3,46 @@
 | 
			
		|||
 | 
			
		||||
package git
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"strings"
 | 
			
		||||
	"unicode"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
const (
 | 
			
		||||
	fileSizeLimit int64 = 16 * 1024   // 16 KiB
 | 
			
		||||
	bigFileSize   int64 = 1024 * 1024 // 1 MiB
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
// mergeLanguageStats merges language names that differ only by letter case
// into a single entry whose value is the sum of the merged sizes.
//
// Within each case-insensitive group, the spelling with the most upper-case
// letters is kept (e.g. "java", "Java" and "JAVA" collapse into "JAVA").
// If several spellings have the same number of upper-case letters, the
// lexicographically smallest one is kept, so the result never depends on the
// randomized iteration order of Go maps.
//
// The input map is not modified; a newly allocated map is returned.
func mergeLanguageStats(stats map[string]int64) map[string]int64 {
	// candidate is the spelling currently chosen for a lower-cased name.
	type candidate struct {
		uniqueName string
		upperCount int
	}
	names := make(map[string]candidate, len(stats))

	countUpper := func(s string) (count int) {
		for _, r := range s {
			if unicode.IsUpper(r) {
				count++
			}
		}
		return count
	}

	// First pass: pick one canonical spelling per case-insensitive name.
	for name := range stats {
		cnt := countUpper(name)
		lower := strings.ToLower(name)
		cur, seen := names[lower]
		switch {
		case !seen,
			cnt > cur.upperCount,
			// Deterministic tie-break between equally "upper" spellings.
			cnt == cur.upperCount && name < cur.uniqueName:
			names[lower] = candidate{uniqueName: name, upperCount: cnt}
		}
	}

	// Second pass: accumulate every size under its canonical spelling.
	res := make(map[string]int64, len(names))
	for name, num := range stats {
		res[names[strings.ToLower(name)].uniqueName] += num
	}
	return res
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -156,7 +156,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 | 
			
		|||
		sizes[firstExcludedLanguage] = firstExcludedLanguageSize
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return sizes, nil
 | 
			
		||||
	return mergeLanguageStats(sizes), nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func readFile(f *object.File, limit int64) ([]byte, error) {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -180,7 +180,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 | 
			
		|||
		// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
 | 
			
		||||
		// - eg. do the all the detection tests using filename first before reading content.
 | 
			
		||||
		language := analyze.GetCodeLanguage(f.Name(), content)
 | 
			
		||||
		if language == enry.OtherLanguage || language == "" {
 | 
			
		||||
		if language == "" {
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -192,8 +192,8 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 | 
			
		|||
 | 
			
		||||
		included, checked := includedLanguage[language]
 | 
			
		||||
		if !checked {
 | 
			
		||||
			langtype := enry.GetLanguageType(language)
 | 
			
		||||
			included = langtype == enry.Programming || langtype == enry.Markup
 | 
			
		||||
			langType := enry.GetLanguageType(language)
 | 
			
		||||
			included = langType == enry.Programming || langType == enry.Markup
 | 
			
		||||
			includedLanguage[language] = included
 | 
			
		||||
		}
 | 
			
		||||
		if included {
 | 
			
		||||
| 
						 | 
				
			
			@ -210,7 +210,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 | 
			
		|||
		sizes[firstExcludedLanguage] = firstExcludedLanguageSize
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return sizes, nil
 | 
			
		||||
	return mergeLanguageStats(sizes), nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func discardFull(rd *bufio.Reader, discard int64) error {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -30,3 +30,17 @@ func TestRepository_GetLanguageStats(t *testing.T) {
 | 
			
		|||
		"Java":   112,
 | 
			
		||||
	}, stats)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func TestMergeLanguageStats(t *testing.T) {
 | 
			
		||||
	assert.EqualValues(t, map[string]int64{
 | 
			
		||||
		"PHP":    1,
 | 
			
		||||
		"python": 10,
 | 
			
		||||
		"JAVA":   700,
 | 
			
		||||
	}, mergeLanguageStats(map[string]int64{
 | 
			
		||||
		"PHP":    1,
 | 
			
		||||
		"python": 10,
 | 
			
		||||
		"Java":   100,
 | 
			
		||||
		"java":   200,
 | 
			
		||||
		"JAVA":   400,
 | 
			
		||||
	}))
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -10,7 +10,7 @@ import (
 | 
			
		|||
 | 
			
		||||
// FallbackErrorf is the last chance to show an error if the logger has internal errors
 | 
			
		||||
func FallbackErrorf(format string, args ...any) {
 | 
			
		||||
	_, _ = fmt.Fprintf(os.Stderr, format+"\n", args)
 | 
			
		||||
	_, _ = fmt.Fprintf(os.Stderr, format+"\n", args...)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func GetLevel() Level {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue