Merge branch 'master' into feature/uast-imports

ajnavarro · web-flow · commit 1e3ca76ed47b · 2019-08-21T12:05:13.000+02:00
diff --git a/.travis.yml b/.travis.yml
@@ -4,28 +4,14 @@ go_import_path: github.com/src-d/gitbase
 go: 1.12.x
 
 env:
-  - GO111MODULE=on
+  - GO111MODULE=on GOPROXY=https://proxy.golang.org
 
 matrix:
   fast_finish: true
-addons:
-  apt:
-    sources:
-    - ubuntu-toolchain-r-test
-    packages:
-    - gcc-6
-    - g++-6
-    - libonig-dev
-
-
-before_install:
-  - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-6 90
-  - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-6 90
 
 before_script:
   - docker run -d --name bblfshd --privileged -p 9432:9432 bblfsh/bblfshd:v2.14.0-drivers
   - docker exec -it bblfshd bblfshctl driver list
-  - go get -v github.com/go-sql-driver/mysql/...
 
 script:
   - make test-coverage codecov
@@ -61,8 +47,6 @@ jobs:
         - echo "skipping before_script for macOS"
 
       script:
-        - brew update
-        - brew install oniguruma
         - make packages || echo "" # will fail because of docker being missing
         - if [ ! -f "build/gitbase_darwin_amd64/gitbase" ]; then echo "gitbase binary not generated" && exit 1; fi
         - cd build
diff --git a/Dockerfile b/Dockerfile
@@ -14,6 +14,7 @@ WORKDIR $GITBASE_PATH
 ENV GO_BUILD_ARGS="-o /bin/gitbase"
 ENV GO_BUILD_PATH="./cmd/gitbase"
 ENV GO111MODULE=on
+ENV GOPROXY=https://proxy.golang.org
 
 RUN make static-build
 
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -7,7 +7,7 @@ pipeline {
       nodeSelector 'srcd.host/type=jenkins-worker'
       containerTemplate {
         name 'regression-gitbase'
-        image 'srcd/regression-gitbase:v0.2.1'
+        image 'srcd/regression-gitbase:v0.3.1'
         ttyEnabled true
         command 'cat'
       }
@@ -17,13 +17,16 @@ pipeline {
     GOPATH = "/go"
     GO_IMPORT_PATH = "github.com/src-d/regression-gibase"
     GO_IMPORT_FULL_PATH = "${env.GOPATH}/src/${env.GO_IMPORT_PATH}"
+    GO111MODULE = "on"
+    PROM_ADDRESS = "http://prom-pushgateway-prometheus-pushgateway.monitoring.svc.cluster.local:9091"
+    PROM_JOB = "gitbase_perfomance"
   }
   triggers { pollSCM('0 0,12 * * *') }
   stages {
     stage('Run') {
       when { branch 'master' }
       steps {
-        sh '/bin/regression --complexity=2 --csv local:HEAD'
+        sh '/bin/regression --complexity=2 --csv --prom local:HEAD'
       }
     }
     stage('PR-run') {
diff --git a/docs/using-gitbase/examples.md b/docs/using-gitbase/examples.md
@@ -41,7 +41,7 @@ HAVING num > 1;
 ## Get the number of blobs per HEAD commit
 
 ```sql
-SELECT COUNT(commit_blob),
+SELECT COUNT(blob_hash),
        commit_hash
 FROM ref_commits
 NATURAL JOIN commits
@@ -137,7 +137,7 @@ CREATE INDEX files_lang_idx ON files USING pilosa (language(file_path, blob_cont
 DROP INDEX files_lang_idx ON files;
 ```
 
-## Calculating code line changes in the last commit
+## Calculating code line changes in the last commit
 
 This query will report how many lines of actual code (only code, not comments, blank lines or text) changed in the last commit of each repository.
 
@@ -166,10 +166,10 @@ The output will be similar to this:
 +-----------------+------------------+--------------------+
 ```
 
-## Calculating code line changes for files in the last commit
+## Calculating code line changes for files in the last commit
 
 This query will report how many lines of actual code (only code, not comments, blank lines or text) changed in each file of the last commit of each repository. It's similar to the previous example. `COMMIT_STATS` is an aggregation over the result of `COMMIT_FILE_STATS` so to speak.
-We will only report those files that whose language has been identified.
+We will only report those files whose language has been identified.
 
 ```sql
 SELECT
@@ -277,7 +277,7 @@ We'll get the following output:
 
 From this output, we can obtain some information about our query:
 - It's been running for 36 seconds.
-- It's querying commit_files table and has processed 8 out of 9 partitions.
+- It's querying the `commit_files` table and has processed 8 out of 9 partitions.
 
 To kill a query that's currently running you can use the value in `Id`. If we were to kill the previous query, we would need to use the following query:
 
diff --git a/docs/using-gitbase/functions.md b/docs/using-gitbase/functions.md
@@ -6,19 +6,19 @@ To make some common tasks easier for the user, there are some functions to inter
 
 |     Name     |                                               Description                                                                      |
 |:-------------|:-------------------------------------------------------------------------------------------------------------------------------|
-|`commit_stats(repository_id, [from_commit_hash], to_commit_hash) json`|returns the stats between two commits for a repository. If from is empty, it will compare the given `to_commit_hash` with its parent commit. Vendored files stats are not included in the result of this function. This function is more thoroughly explained later in this document.|
-|`commit_file_stats(repository_id, [from_commit_hash], to_commit_hash) json array`|returns an array with the stats of each file in `to_commit_hash` since the given `from_commit_hash`. If from is not given, the parent commit will be used. Vendored files stats are not included in the result of this function. This function is more thoroughly explained later in this document.|
-|`is_remote(reference_name)bool`| check if the given reference name is from a remote one                                                          |
-|`is_tag(reference_name)bool`| check if the given reference name is a tag                                                                         |
-|`is_vendor(file_path)bool`| check if the given file name is a vendored file                                                                         |
-|`language(path, [blob])text`| gets the language of a file given its path and the optional content of the file                                    |
-|`uast(blob, [lang, [xpath]]) blob`| returns a node array of UAST nodes in semantic mode                                                          |
-|`uast_mode(mode, blob, lang) blob`| returns a node array of UAST nodes specifying its language and mode (semantic, annotated or native)          |
-|`uast_xpath(blob, xpath) blob`| performs an XPath query over the given UAST nodes                                                                |
-|`uast_extract(blob, key) text array`| extracts information identified by the given key from the uast nodes                                       |
-|`uast_children(blob) blob`| returns a flattened array of the children UAST nodes from each one of the UAST nodes in the given array              |
-|`loc(path, blob) json`| returns a JSON map, containing the lines of code of a file, separated in three categories: Code, Blank and Comment lines |
-|`version() text`| returns the gitbase version in the following format `8.0.11-{GITBASE_VERSION}` for compatibility with MySQL versioning |
+|`commit_stats(repository_id, [from_commit_hash], to_commit_hash) json`|returns the stats between two commits for a repository. If `from_commit_hash` is empty, it will compare the given `to_commit_hash` with its parent commit. Vendored files stats are not included in the result of this function. This function is more thoroughly explained later in this document.|
+|`commit_file_stats(repository_id, [from_commit_hash], to_commit_hash) json array`|returns an array with the stats of each file in `to_commit_hash` since the given `from_commit_hash`. If `from_commit_hash` is not given, the parent commit will be used. Vendored files stats are not included in the result of this function. This function is more thoroughly explained later in this document.|
+|`is_remote(reference_name) bool`| checks if the given reference name is from a remote one.                                                         |
+|`is_tag(reference_name) bool`| checks if the given reference name is a tag.                                                                        |
+|`is_vendor(file_path) bool`| checks if the given file name is a vendored file.                                                                        |
+|`language(path, [blob]) text`| gets the language of a file given its path and the optional content of the file.                                   |
+|`uast(blob, [lang, [xpath]]) blob`| returns a node array of UAST nodes in semantic mode.                                                          |
+|`uast_mode(mode, blob, lang) blob`| returns a node array of UAST nodes specifying its language and mode (semantic, annotated or native).          |
+|`uast_xpath(blob, xpath) blob`| performs an XPath query over the given UAST nodes.                                                                |
+|`uast_extract(blob, key) text array`| extracts information identified by the given key from the UAST nodes.                                       |
+|`uast_children(blob) blob`| returns a flattened array of the children UAST nodes from each one of the UAST nodes in the given array.              |
+|`loc(path, blob) json`| returns a JSON map, containing the lines of code of a file, separated in three categories: Code, Blank and Comment lines. |
+|`version() text`| returns the gitbase version in the following format `8.0.11-{GITBASE_VERSION}` for compatibility with MySQL versioning. |
 ## Standard functions
 
 These are all functions that are available because they are implemented in `go-mysql-server`, used by gitbase.
@@ -159,23 +159,29 @@ Check out the [UAST v2 specification](https://docs.sourced.tech/babelfish/uast/u
 
 Using these selectors as in,
 
-> uast_extract(nodes_column, @common_selector)
+```
+uast_extract(nodes_column, @common_selector)
+```
 
 you will extract the value of that property for each node.
 
 Nodes that have no value for the requested property will not be present in any way in the final array. That is, having a sequence of nodes `[node-1, node-2, node-3]` and knowing that node-2 doesn't have a value for the requested property, the returned array will be `[prop-1, prop-3]`.
 
 Also, if you want to retrieve values from a non common property, you can pass it directly
 
-> uast_extract(nodes_column, 'some-property')
+```
+uast_extract(nodes_column, 'some-property')
+```
 
 ## How to use `loc`
 
 `loc` will return statistics about the lines of code in a file, such as the code lines, comment lines, etc.
 
 It requires a file path and a file content.
 
-> loc(file_path, blob_content)
+```
+loc(file_path, blob_content)
+```
 
 The result of this function is a JSON document with the following shape:
 
@@ -266,9 +272,9 @@ FROM (
 
 It can be used in two ways:
 - To get the statistics of a specific commit `COMMIT_STATS(repository_id, commit_hash)`
-- To get the statistics of a the diff of a commit range `COMMIT_STATS(repository_id, from_commit, to_commit)`
+- To get the statistics of the diff of a commit range `COMMIT_STATS(repository_id, from_commit, to_commit)`
 
-`commit_stats` it's pretty much an aggregation of the result of `commit_file_stats`. While `commit_file_stats` has the stats for each file in a commit, `commit_stats` has the global stats of all files in the commit. As a result, it outputs a single structure instead of an array of them.
+`commit_stats` is pretty much an aggregation of the result of `commit_file_stats`. While `commit_file_stats` has the stats for each file in a commit, `commit_stats` has the global stats of all files in the commit. As a result, it outputs a single structure instead of an array of them.
 
 The shape of the result returned by this function is the following:
 
@@ -300,7 +306,7 @@ The shape of the result returned by this function is the following:
 
 **NOTE:** Files that are considered vendored files are ignored for the purpose of computing these statistics. Note that `.gitignore` is considered a vendored file.
 
-The result returned by this function is a JSON, which means to access its fields, the use of `JSON_EXTRACT is needed.
+The result returned by this function is a JSON document, which means that `JSON_EXTRACT` is needed to access its fields.
 
 For example, code additions would be accessed like this:
 ```sql
diff --git a/docs/using-gitbase/indexes.md b/docs/using-gitbase/indexes.md
@@ -7,8 +7,8 @@ Indexes are implemented as bitmaps using [pilosa](https://github.com/pilosa/pilo
 Thus, to create indexes you must specify pilosa as the type of index. You can find some examples in the [examples](./examples.md#create-an-index-for-columns-on-a-table) section about managing indexes.
 
 Note that you can create an index either **on one or more columns** or **on a single expression**.
-In practice, having multiple indexes - one per column is better and more flexible than one index for multiple columns. It is because of data structures (bitmaps) used to represent index values.
-Even if you have one index on multiple columns, every columns is stored in independent _field_.
+In practice, having multiple indexes (one per column) is better and more flexible than one index for multiple columns. This is because of the data structures (bitmaps) used to represent index values.
+Even if you have one index on multiple columns, every column is stored in an independent _field_.
 Merging those _fields_ by any logic operations is fast and much more flexible. The main difference of having multiple columns per index is, it internally calculates intersection across columns, so the index won't be used if you use _non_ `AND` operation in a filter, e.g.:
 
 With index on (`A`, `B`), the index will be used for following query:
@@ -26,4 +26,4 @@ and for the second query also two indexes will be used and the result will be a
 
 You can find some more examples in the [examples](./examples.md#create-an-index-for-columns-on-a-table) section.
 
-See [go-mysql-server](https://github.com/src-d/go-mysql-server/tree/541fde3b92093b3a449e803342a7a18c686275e6#indexes) documentation for more details
+See [go-mysql-server](https://github.com/src-d/go-mysql-server/tree/541fde3b92093b3a449e803342a7a18c686275e6#indexes) documentation for more details.
diff --git a/docs/using-gitbase/optimize-queries.md b/docs/using-gitbase/optimize-queries.md
@@ -2,7 +2,7 @@
 
 Even though in each release performance improvements are included to make gitbase faster, there are some queries that might take too long. By rewriting them in some ways, you can squeeze that extra performance you need by taking advantage of some optimisations that are already in place.
 
-There are two ways to optimize a gitbase query:
+There are three ways to optimize a gitbase query:
 - Create an index for some parts.
 - Making sure the joined tables are squashed.
 - Making sure not squashed joins are performed in memory.
@@ -82,7 +82,7 @@ So, as a good rule of thumb, the right side of an inner join should always be th
 The more obvious way to improve the performance of a query is to create an index for such query. Since you can index multiple columns or a single arbitrary expression, this may be useful for some kinds of queries. For example, if you're querying by language, you may want to index that so there is no need to compute the language each time.
 
 ```sql
-CREATE INDEX files_language_idx ON files USING pilosa (language(file_path, blob_content))
+CREATE INDEX files_language_idx ON files USING pilosa (language(file_path, blob_content));
 ```
 
 Once you have the index in place, gitbase only looks for the rows with the values matching your conditions.
@@ -199,37 +199,37 @@ This advice can be applied to all squashed tables, not only `repository_id`.
 
 This query will get squashed, because `NATURAL JOIN` makes sure all columns with equal names are used in the join.
 ```sql
-SELECT * FROM refs NATURAL JOIN ref_commits NATURAL JOIN commits
+SELECT * FROM refs NATURAL JOIN ref_commits NATURAL JOIN commits;
 ```
 
 This query, however, will not be squashed.
 ```sql
 SELECT * FROM refs r
 INNER JOIN ref_commits rc ON r.ref_name = rc.ref_name
-INNER JOIN commits c ON rc.commit_hash = c.commit_hash
+INNER JOIN commits c ON rc.commit_hash = c.commit_hash;
 ```
 
 **It requires some filters to be present in order to perform the squash.**
 
 This query will be squashed.
 
 ```sql
-SELECT * FROM commit_files NATURAL JOIN files
+SELECT * FROM commit_files NATURAL JOIN files;
 ```
 
 This query will not be squashed, as the join between `commit_files` and `files` requires more filters to be squashed.
 
 ```sql
 SELECT * FROM commit_files cf
-INNER JOIN files f ON cf.file_path = f.file_path
+INNER JOIN files f ON cf.file_path = f.file_path;
 ```
 
 **TIP:** we suggest always using `NATURAL JOIN` for joining tables, since it's less verbose and already satisfies all the filters for squashing tables.
 The only exception to this advice is when joining `refs` and `ref_commits`. A `NATURAL JOIN` between `refs` and `ref_commits` will only get the HEAD commit of the reference. The same happens with `commits` and `commit_trees`/`commit_files`.
 
 You can find the full list of conditions that need to be met for the squash to be applied [here](#list-of-filters-for-squashed-tables).
 
-**Only works if the tables joined follow a hierarchy.** Joinin `commits` and `files` does not work, or joining `blobs` with `files`. It needs to follow one of the hierarchies of tables.
+**Only works if the tables joined follow a hierarchy.** Joining `commits` and `files` does not work, or joining `blobs` with `files`. It needs to follow one of the hierarchies of tables.
 
 ```
 repositories -> refs -> ref_commits -> commits -> commit_trees -> tree_entries -> blobs
@@ -374,4 +374,4 @@ FROM (
 GROUP BY lang
 ```
 
-As a good rule of thumb: defer as much as possible GROUP BY and ORDER BY operations and only perform them with the minimum amount of data needed.
+As a good rule of thumb: defer GROUP BY and ORDER BY operations as much as possible and only perform them with the minimum amount of data needed.
diff --git a/docs/using-gitbase/schema.md b/docs/using-gitbase/schema.md
@@ -179,7 +179,7 @@ This table represents the relation between commits and [files](#files). Using th
 
 This table allow us to get the commit history from a specific reference name. `history_index` column represents the position of the commit from a specific reference.
 
-This table it's like the [log](https://git-scm.com/docs/git-log) from a specific reference.
+This table is like the [log](https://git-scm.com/docs/git-log) from a specific reference.
 
 Commits will be repeated if they are in several repositories or references.
 
diff --git a/docs/using-gitbase/supported-languages.md b/docs/using-gitbase/supported-languages.md
@@ -1,4 +1,4 @@
-## Supported languages
+# Supported languages
 
 Gitbase supports many programming languages depending on the use case.
 For instance the `language(path, [blob])` function supports all languages which [enry's package](https://github.com/src-d/enry) can autodetect.
@@ -14,4 +14,4 @@ If your use case requires _Universal Abstract Syntax Tree_ then most likely one
 
 The _UAST_ functions support programming languages which already have implemented [babelfish](https://docs.sourced.tech/babelfish) driver.
 The list of currently supported languages on babelfish, you can find [here](https://docs.sourced.tech/babelfish/languages#supported-languages).
-Drivers which are still in development can be find [here](https://docs.sourced.tech/babelfish/languages#in-development).
+Drivers which are still in development can be found [here](https://docs.sourced.tech/babelfish/languages#in-development).