Change language statistics to save size instead of percentage (#11681)
* Change language statistics to save size instead of percentage in database Co-Authored-By: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com> * Do not exclude if only language * Fix edge cases with special languages Co-authored-by: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									4395c607ed
								
							
						
					
					
						commit
						ea4c139cd2
					
				
					 5 changed files with 150 additions and 38 deletions
				
			
		| 
						 | 
				
			
			@ -212,6 +212,8 @@ var migrations = []Migration{
 | 
			
		|||
	NewMigration("Add ResolveDoerID to Comment table", addResolveDoerIDCommentColumn),
 | 
			
		||||
	// v139 -> v140
 | 
			
		||||
	NewMigration("prepend refs/heads/ to issue refs", prependRefsHeadsToIssueRefs),
 | 
			
		||||
	// v140 -> v141
 | 
			
		||||
	NewMigration("Save detected language file size to database instead of percent", fixLanguageStatsToSaveSize),
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// GetCurrentDBVersion returns the current db version
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										56
									
								
								models/migrations/v140.go
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										56
									
								
								models/migrations/v140.go
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,56 @@
 | 
			
		|||
// Copyright 2020 The Gitea Authors. All rights reserved.
 | 
			
		||||
// Use of this source code is governed by a MIT-style
 | 
			
		||||
// license that can be found in the LICENSE file.
 | 
			
		||||
 | 
			
		||||
package migrations
 | 
			
		||||
 | 
			
		||||
import (
 | 
			
		||||
	"fmt"
 | 
			
		||||
 | 
			
		||||
	"code.gitea.io/gitea/modules/setting"
 | 
			
		||||
 | 
			
		||||
	"xorm.io/xorm"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
func fixLanguageStatsToSaveSize(x *xorm.Engine) error {
 | 
			
		||||
	// LanguageStat see models/repo_language_stats.go
 | 
			
		||||
	type LanguageStat struct {
 | 
			
		||||
		Size int64 `xorm:"NOT NULL DEFAULT 0"`
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// RepoIndexerType specifies the repository indexer type
 | 
			
		||||
	type RepoIndexerType int
 | 
			
		||||
 | 
			
		||||
	const (
 | 
			
		||||
		// RepoIndexerTypeCode code indexer
 | 
			
		||||
		RepoIndexerTypeCode RepoIndexerType = iota // 0
 | 
			
		||||
		// RepoIndexerTypeStats repository stats indexer
 | 
			
		||||
		RepoIndexerTypeStats // 1
 | 
			
		||||
	)
 | 
			
		||||
 | 
			
		||||
	// RepoIndexerStatus see models/repo_indexer.go
 | 
			
		||||
	type RepoIndexerStatus struct {
 | 
			
		||||
		IndexerType RepoIndexerType `xorm:"INDEX(s) NOT NULL DEFAULT 0"`
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if err := x.Sync2(new(LanguageStat)); err != nil {
 | 
			
		||||
		return fmt.Errorf("Sync2: %v", err)
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	x.Delete(&RepoIndexerStatus{IndexerType: RepoIndexerTypeStats})
 | 
			
		||||
 | 
			
		||||
	// Delete language stat statuses
 | 
			
		||||
	truncExpr := "TRUNCATE TABLE"
 | 
			
		||||
	if setting.Database.UseSQLite3 {
 | 
			
		||||
		truncExpr = "DELETE FROM"
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// Delete language stats
 | 
			
		||||
	if _, err := x.Exec(fmt.Sprintf("%s language_stat", truncExpr)); err != nil {
 | 
			
		||||
		return err
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	sess := x.NewSession()
 | 
			
		||||
	defer sess.Close()
 | 
			
		||||
	return dropTableColumns(sess, "language_stat", "percentage")
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			@ -20,11 +20,28 @@ type LanguageStat struct {
 | 
			
		|||
	CommitID    string
 | 
			
		||||
	IsPrimary   bool
 | 
			
		||||
	Language    string             `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"`
 | 
			
		||||
	Percentage  float32            `xorm:"NUMERIC(5,2) NOT NULL DEFAULT 0"`
 | 
			
		||||
	Percentage  float32            `xorm:"-"`
 | 
			
		||||
	Size        int64              `xorm:"NOT NULL DEFAULT 0"`
 | 
			
		||||
	Color       string             `xorm:"-"`
 | 
			
		||||
	CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"`
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// specialLanguages is the set of languages that are left out of the
// percentage calculation, unless the repository contains nothing but
// languages from this set. Only add languages here that are not normally
// regarded as code.
var specialLanguages = map[string]struct{}{
	// Data and configuration formats.
	"INI":  {},
	"JSON": {},
	"TOML": {},
	"XML":  {},
	"YAML": {},

	// Query and vector-graphics formats.
	"SQL": {},
	"SVG": {},

	// Prose, plus the catch-all bucket.
	"Markdown": {},
	"Text":     {},
	"other":    {},
}
 | 
			
		||||
 | 
			
		||||
// LanguageStatList defines a list of language statistics
 | 
			
		||||
type LanguageStatList []*LanguageStat
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -34,12 +51,53 @@ func (stats LanguageStatList) loadAttributes() {
 | 
			
		|||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (stats LanguageStatList) getLanguagePercentages() map[string]float32 {
 | 
			
		||||
	langPerc := make(map[string]float32)
 | 
			
		||||
	var otherPerc float32 = 100
 | 
			
		||||
	var total int64
 | 
			
		||||
	// Check that repository has at least one non-special language
 | 
			
		||||
	var skipSpecial bool
 | 
			
		||||
	for _, stat := range stats {
 | 
			
		||||
		if _, ok := specialLanguages[stat.Language]; !ok {
 | 
			
		||||
			skipSpecial = true
 | 
			
		||||
			break
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
	for _, stat := range stats {
 | 
			
		||||
		// Exclude specific languages from percentage calculation
 | 
			
		||||
		if _, ok := specialLanguages[stat.Language]; ok && skipSpecial {
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
		total += stat.Size
 | 
			
		||||
	}
 | 
			
		||||
	if total > 0 {
 | 
			
		||||
		for _, stat := range stats {
 | 
			
		||||
			// Exclude specific languages from percentage calculation
 | 
			
		||||
			if _, ok := specialLanguages[stat.Language]; ok && skipSpecial {
 | 
			
		||||
				continue
 | 
			
		||||
			}
 | 
			
		||||
			perc := float32(math.Round(float64(stat.Size)/float64(total)*1000) / 10)
 | 
			
		||||
			if perc <= 0.1 {
 | 
			
		||||
				continue
 | 
			
		||||
			}
 | 
			
		||||
			otherPerc -= perc
 | 
			
		||||
			langPerc[stat.Language] = perc
 | 
			
		||||
		}
 | 
			
		||||
		otherPerc = float32(math.Round(float64(otherPerc)*10) / 10)
 | 
			
		||||
	} else {
 | 
			
		||||
		otherPerc = 100
 | 
			
		||||
	}
 | 
			
		||||
	if otherPerc > 0 {
 | 
			
		||||
		langPerc["other"] = otherPerc
 | 
			
		||||
	}
 | 
			
		||||
	return langPerc
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func (repo *Repository) getLanguageStats(e Engine) (LanguageStatList, error) {
 | 
			
		||||
	stats := make(LanguageStatList, 0, 6)
 | 
			
		||||
	if err := e.Where("`repo_id` = ?", repo.ID).Desc("`percentage`").Find(&stats); err != nil {
 | 
			
		||||
	if err := e.Where("`repo_id` = ?", repo.ID).Desc("`size`").Find(&stats); err != nil {
 | 
			
		||||
		return nil, err
 | 
			
		||||
	}
 | 
			
		||||
	stats.loadAttributes()
 | 
			
		||||
	return stats, nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -54,13 +112,18 @@ func (repo *Repository) GetTopLanguageStats(limit int) (LanguageStatList, error)
 | 
			
		|||
	if err != nil {
 | 
			
		||||
		return nil, err
 | 
			
		||||
	}
 | 
			
		||||
	perc := stats.getLanguagePercentages()
 | 
			
		||||
	topstats := make(LanguageStatList, 0, limit)
 | 
			
		||||
	var other float32
 | 
			
		||||
	for i := range stats {
 | 
			
		||||
		if stats[i].Language == "other" || len(topstats) >= limit {
 | 
			
		||||
			other += stats[i].Percentage
 | 
			
		||||
		if _, ok := perc[stats[i].Language]; !ok {
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
		if stats[i].Language == "other" || len(topstats) >= limit {
 | 
			
		||||
			other += perc[stats[i].Language]
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
		stats[i].Percentage = perc[stats[i].Language]
 | 
			
		||||
		topstats = append(topstats, stats[i])
 | 
			
		||||
	}
 | 
			
		||||
	if other > 0 {
 | 
			
		||||
| 
						 | 
				
			
			@ -71,11 +134,12 @@ func (repo *Repository) GetTopLanguageStats(limit int) (LanguageStatList, error)
 | 
			
		|||
			Percentage: float32(math.Round(float64(other)*10) / 10),
 | 
			
		||||
		})
 | 
			
		||||
	}
 | 
			
		||||
	topstats.loadAttributes()
 | 
			
		||||
	return topstats, nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// UpdateLanguageStats updates the language statistics for repository
 | 
			
		||||
func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]float32) error {
 | 
			
		||||
func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]int64) error {
 | 
			
		||||
	sess := x.NewSession()
 | 
			
		||||
	if err := sess.Begin(); err != nil {
 | 
			
		||||
		return err
 | 
			
		||||
| 
						 | 
				
			
			@ -87,15 +151,15 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl
 | 
			
		|||
		return err
 | 
			
		||||
	}
 | 
			
		||||
	var topLang string
 | 
			
		||||
	var p float32
 | 
			
		||||
	for lang, perc := range stats {
 | 
			
		||||
		if perc > p {
 | 
			
		||||
			p = perc
 | 
			
		||||
	var s int64
 | 
			
		||||
	for lang, size := range stats {
 | 
			
		||||
		if size > s {
 | 
			
		||||
			s = size
 | 
			
		||||
			topLang = strings.ToLower(lang)
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	for lang, perc := range stats {
 | 
			
		||||
	for lang, size := range stats {
 | 
			
		||||
		upd := false
 | 
			
		||||
		llang := strings.ToLower(lang)
 | 
			
		||||
		for _, s := range oldstats {
 | 
			
		||||
| 
						 | 
				
			
			@ -103,8 +167,8 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl
 | 
			
		|||
			if strings.ToLower(s.Language) == llang {
 | 
			
		||||
				s.CommitID = commitID
 | 
			
		||||
				s.IsPrimary = llang == topLang
 | 
			
		||||
				s.Percentage = perc
 | 
			
		||||
				if _, err := sess.ID(s.ID).Cols("`commit_id`", "`percentage`", "`is_primary`").Update(s); err != nil {
 | 
			
		||||
				s.Size = size
 | 
			
		||||
				if _, err := sess.ID(s.ID).Cols("`commit_id`", "`size`", "`is_primary`").Update(s); err != nil {
 | 
			
		||||
					return err
 | 
			
		||||
				}
 | 
			
		||||
				upd = true
 | 
			
		||||
| 
						 | 
				
			
			@ -114,11 +178,11 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl
 | 
			
		|||
		// Insert new language
 | 
			
		||||
		if !upd {
 | 
			
		||||
			if _, err := sess.Insert(&LanguageStat{
 | 
			
		||||
				RepoID:     repo.ID,
 | 
			
		||||
				CommitID:   commitID,
 | 
			
		||||
				IsPrimary:  llang == topLang,
 | 
			
		||||
				Language:   lang,
 | 
			
		||||
				Percentage: perc,
 | 
			
		||||
				RepoID:    repo.ID,
 | 
			
		||||
				CommitID:  commitID,
 | 
			
		||||
				IsPrimary: llang == topLang,
 | 
			
		||||
				Language:  lang,
 | 
			
		||||
				Size:      size,
 | 
			
		||||
			}); err != nil {
 | 
			
		||||
				return err
 | 
			
		||||
			}
 | 
			
		||||
| 
						 | 
				
			
			@ -153,7 +217,7 @@ func CopyLanguageStat(originalRepo, destRepo *Repository) error {
 | 
			
		|||
		return err
 | 
			
		||||
	}
 | 
			
		||||
	RepoLang := make(LanguageStatList, 0, 6)
 | 
			
		||||
	if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`percentage`").Find(&RepoLang); err != nil {
 | 
			
		||||
	if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`size`").Find(&RepoLang); err != nil {
 | 
			
		||||
		return err
 | 
			
		||||
	}
 | 
			
		||||
	if len(RepoLang) > 0 {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -8,7 +8,6 @@ import (
 | 
			
		|||
	"bytes"
 | 
			
		||||
	"io"
 | 
			
		||||
	"io/ioutil"
 | 
			
		||||
	"math"
 | 
			
		||||
 | 
			
		||||
	"code.gitea.io/gitea/modules/analyze"
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -21,7 +20,7 @@ import (
 | 
			
		|||
const fileSizeLimit int64 = 16 * 1024 * 1024
 | 
			
		||||
 | 
			
		||||
// GetLanguageStats calculates language stats for git repository at specified commit
 | 
			
		||||
func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, error) {
 | 
			
		||||
func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) {
 | 
			
		||||
	r, err := git.PlainOpen(repo.Path)
 | 
			
		||||
	if err != nil {
 | 
			
		||||
		return nil, err
 | 
			
		||||
| 
						 | 
				
			
			@ -43,7 +42,6 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e
 | 
			
		|||
	}
 | 
			
		||||
 | 
			
		||||
	sizes := make(map[string]int64)
 | 
			
		||||
	var total int64
 | 
			
		||||
	err = tree.Files().ForEach(func(f *object.File) error {
 | 
			
		||||
		if enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) ||
 | 
			
		||||
			enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) {
 | 
			
		||||
| 
						 | 
				
			
			@ -60,11 +58,10 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e
 | 
			
		|||
 | 
			
		||||
		language := analyze.GetCodeLanguage(f.Name, content)
 | 
			
		||||
		if language == enry.OtherLanguage || language == "" {
 | 
			
		||||
			return nil
 | 
			
		||||
			language = "other"
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		sizes[language] += f.Size
 | 
			
		||||
		total += f.Size
 | 
			
		||||
 | 
			
		||||
		return nil
 | 
			
		||||
	})
 | 
			
		||||
| 
						 | 
				
			
			@ -72,21 +69,11 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e
 | 
			
		|||
		return nil, err
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	stats := make(map[string]float32)
 | 
			
		||||
	var otherPerc float32 = 100
 | 
			
		||||
	for language, size := range sizes {
 | 
			
		||||
		perc := float32(math.Round(float64(size)/float64(total)*1000) / 10)
 | 
			
		||||
		if perc <= 0.1 {
 | 
			
		||||
			continue
 | 
			
		||||
		}
 | 
			
		||||
		otherPerc -= perc
 | 
			
		||||
		stats[language] = perc
 | 
			
		||||
	if len(sizes) == 0 {
 | 
			
		||||
		sizes["other"] = 0
 | 
			
		||||
	}
 | 
			
		||||
	otherPerc = float32(math.Round(float64(otherPerc)*10) / 10)
 | 
			
		||||
	if otherPerc > 0 {
 | 
			
		||||
		stats["other"] = otherPerc
 | 
			
		||||
	}
 | 
			
		||||
	return stats, nil
 | 
			
		||||
 | 
			
		||||
	return sizes, nil
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func readFile(f *object.File, limit int64) ([]byte, error) {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -34,6 +34,9 @@ func TestRepoStatsIndex(t *testing.T) {
 | 
			
		|||
 | 
			
		||||
	repo, err := models.GetRepositoryByID(1)
 | 
			
		||||
	assert.NoError(t, err)
 | 
			
		||||
	status, err := repo.GetIndexerStatus(models.RepoIndexerTypeStats)
 | 
			
		||||
	assert.NoError(t, err)
 | 
			
		||||
	assert.Equal(t, "65f1bf27bc3bf70f64657658635e66094edbcb4d", status.CommitSha)
 | 
			
		||||
	langs, err := repo.GetTopLanguageStats(5)
 | 
			
		||||
	assert.NoError(t, err)
 | 
			
		||||
	assert.Len(t, langs, 1)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue