guix-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH]: Rewrite CRAN importer.


From: Ricardo Wurmus
Subject: Re: [PATCH]: Rewrite CRAN importer.
Date: Thu, 10 Dec 2015 16:12:53 +0100

Ludovic Courtès <address@hidden> writes:

>> +(define (safe-car maybe-pair)
>
> Does it have airbags?

Rounded corners, no explosive combustion engine, and wrapped in an extra
thick layer of fluff!

>> +  (if (or (null? maybe-pair)
>> +          (not maybe-pair))
>> +      #f
>> +      (car maybe-pair)))
>
> Seriously, I think this is the wrong way to deal with that.

Okay.  I’m using ‘match’ instead.  It’s also shorter and clearer.

>> +(define (package->cran-name package)
>> +  "Given a Guix PACKAGE value, return the name of the R package on CRAN."
>> +  (let* ((source-url (and=> (package-source package)
>> +                            (compose safe-car origin-uri)))
>
> I would change it to:
>
>   (match (package-source package)
>     ((? origin? origin)
>      (match (origin-uri origin)
>        ((url rest ...)
>         (let ((end (string-rindex url #\_)) …) …))
>        (_               #f)))
>     (_ #f))
>
> This is more verbose, but it makes the intent clearer IMO, and avoids
> “wrong-type-arg #f” errors that would arise with the proposed code.

Yes, this is better.  I’ve also made this private as it is not needed
anywhere else.

>> +  (let ((url (string-append %cran-url name "/DESCRIPTION")))
>> +    (call-with-temporary-output-file
>> +     (lambda (temp port)
>> +       (and (url-fetch url temp)
>> +            (call-with-input-file temp
>> +              (compose description->alist read-string)))))))
>
> I think it’s best to use ‘http-fetch’ or ‘http-fetch/cached’ from (guix
> http-client), which does not necessitate the creation of a temporary
> file.

Okay.

Attached is a new patch (I didn’t resend the first patch adding new
license conversions).

>From 7f90add2b33707178964364c86f8a4440e6bf554 Mon Sep 17 00:00:00 2001
From: Ricardo Wurmus <address@hidden>
Date: Thu, 3 Dec 2015 16:12:09 +0100
Subject: [PATCH] import: cran: Parse DESCRIPTION instead of HTML.

* guix/import/cran.scm (description->alist, listify,
  beautify-description, description->package): New procedures.
(table-datum, downloads->url, nodes->text, cran-sxml->sexp): Remove
proceduces.
(latest-release): Use parsed DESCRIPTION instead of SXML.
* tests/cran.scm: Rewrite to match importer.
---
 guix/import/cran.scm | 265 +++++++++++++++++++++++++--------------------------
 tests/cran.scm       | 189 +++++++++++++++---------------------
 2 files changed, 209 insertions(+), 245 deletions(-)

diff --git a/guix/import/cran.scm b/guix/import/cran.scm
index 43dc2c8..ff53a40 100644
--- a/guix/import/cran.scm
+++ b/guix/import/cran.scm
@@ -20,26 +20,26 @@
 (define-module (guix import cran)
   #:use-module (ice-9 match)
   #:use-module (ice-9 regex)
+  #:use-module ((ice-9 rdelim) #:select (read-string))
   #:use-module (srfi srfi-1)
   #:use-module (srfi srfi-26)
-  #:use-module (sxml simple)
-  #:use-module (sxml match)
-  #:use-module (sxml xpath)
   #:use-module (guix http-client)
   #:use-module (guix hash)
   #:use-module (guix store)
   #:use-module (guix base32)
   #:use-module ((guix download) #:select (download-to-store))
   #:use-module (guix import utils)
+  #:use-module ((guix build-system r) #:select (cran-uri))
   #:use-module (guix upstream)
   #:use-module (guix packages)
+  #:use-module (gnu packages)
   #:export (cran->guix-package
             %cran-updater))
 
 ;;; Commentary:
 ;;;
 ;;; Generate a package declaration template for the latest version of an R
-;;; package on CRAN, using the HTML description downloaded from
+;;; package on CRAN, using the DESCRIPTION file downloaded from
 ;;; cran.r-project.org.
 ;;;
 ;;; Code:
@@ -67,6 +67,31 @@
    ((lst ...) `(list ,@(map string->license lst)))
    (_ #f)))
 
+
+(define (description->alist description)
+  "Convert a DESCRIPTION string into an alist."
+  (let ((lines (string-split description #\newline))
+        (parse (lambda (line acc)
+                 (if (string-null? line) acc
+                     ;; Keys usually start with a capital letter and end with
+                     ;; ":".  There are some exceptions, unfortunately (such
+                     ;; as "biocViews").  There are no blanks in a key.
+                     (if (string-match "^[A-Za-z][^ :]+:( |\n|$)" line)
+                         ;; New key/value pair
+                         (let* ((pos   (string-index line #\:))
+                                (key   (string-take line pos))
+                                (value (string-drop line (+ 1 pos))))
+                           (cons (cons key
+                                       (string-trim-both value))
+                                 acc))
+                         ;; This is a continuation of the previous pair
+                         (match-let ((((key . value) . rest) acc))
+                           (cons (cons key (string-join
+                                            (list value
+                                                  (string-trim-both line))))
+                                 rest)))))))
+    (fold parse '() lines)))
+
 (define (format-inputs names)
   "Generate a sorted list of package inputs from a list of package NAMES."
   (map (lambda (name)
@@ -82,125 +107,94 @@ package definition."
     ((package-inputs ...)
      `((,type (,'quasiquote ,(format-inputs package-inputs)))))))
 
-(define (table-datum tree label)
-  "Extract the datum node following a LABEL in the sxml table TREE.  Only the
-first cell of a table row is considered a label cell."
-  ((node-pos 1)
-   ((sxpath `(xhtml:tr
-              (xhtml:td 1)        ; only first cell can contain label
-              (equal? ,label)
-              ,(node-parent tree) ; go up to label cell
-              ,(node-parent tree) ; go up to matching row
-              (xhtml:td 2)))      ; select second cell
-    tree)))
-
 (define %cran-url "http://cran.r-project.org/web/packages/";)
 
 (define (cran-fetch name)
-  "Return an sxml representation of the CRAN page for the R package NAME,
-or #f on failure.  NAME is case-sensitive."
+  "Return an alist of the contents of the DESCRIPTION file for the R package
+NAME, or #f on failure.  NAME is case-sensitive."
   ;; This API always returns the latest release of the module.
-  (let ((cran-url (string-append %cran-url name "/")))
-    (false-if-exception
-     (xml->sxml (http-fetch cran-url)
-                #:trim-whitespace? #t
-                #:namespaces '((xhtml . "http://www.w3.org/1999/xhtml";))
-                #:default-entity-handler
-                (lambda (port name)
-                  (case name
-                    ((nbsp) " ")
-                    ((ge) ">=")
-                    ((gt) ">")
-                    ((lt) "<")
-                    (else
-                     (format (current-warning-port)
-                             "~a:~a:~a: undefined entitity: ~a\n"
-                             cran-url (port-line port) (port-column port)
-                             name)
-                     (symbol->string name))))))))
-
-(define (downloads->url downloads)
-  "Extract from DOWNLOADS, the downloads item of the CRAN sxml tree, the
-download URL."
-  (string-append "mirror://cran/"
-                 ;; Remove double dots, because we want an
-                 ;; absolute path.
-                 (regexp-substitute/global
-                  #f "\\.\\./"
-                  (string-join ((sxpath '((xhtml:a 1) @ href *text*))
-                                (table-datum downloads " Package source: ")))
-                  'pre 'post)))
-
-(define (nodes->text nodeset)
-  "Return the concatenation of the text nodes among NODESET."
-  (string-join ((sxpath '(// *text*)) nodeset) " "))
-
-(define (cran-sxml->sexp sxml)
-  "Return the `package' s-expression for a CRAN package from the SXML
-representation of the package page."
+  (let ((url (string-append %cran-url name "/DESCRIPTION")))
+    ((compose description->alist read-string) (http-fetch url))))
+
+(define (listify meta field)
+  "Look up FIELD in the alist META.  If FIELD contains a comma-separated
+string, turn it into a list and strip off parenthetic expressions.  Return the
+empty list when the FIELD cannot be found."
+  (let ((value (assoc-ref meta field)))
+    (if (not value)
+        '()
+        ;; Strip off parentheses
+        (let ((items (string-split (regexp-substitute/global
+                                    #f "( *\\([^\\)]+\\)) *"
+                                    value 'pre 'post)
+                                   #\,)))
+          ;; When there is whitespace inside of items it is probably because
+          ;; this was not an actual list to begin with.
+          (remove (cut string-any char-set:whitespace <>)
+                  (map string-trim-both items))))))
+
+(define (beautify-description description)
+  "Improve the package DESCRIPTION by turning a beginning sentence fragment
+into a proper sentence and by using two spaces between sentences."
+  (let ((cleaned (if (string-prefix? "A " description)
+                     (string-append "This package provides a"
+                                    (substring description 1))
+                     description)))
+    ;; Use double spacing between sentences
+    (regexp-substitute/global #f "\\. \\b"
+                              cleaned 'pre ".  " 'post)))
+
+(define (description->package meta)
+  "Return the `package' s-expression for a CRAN package from the alist META,
+which was derived from the R package's DESCRIPTION file."
   (define (guix-name name)
     (if (string-prefix? "r-" name)
         (string-downcase name)
         (string-append "r-" (string-downcase name))))
 
-  (sxml-match-let*
-   (((*TOP* (xhtml:html
-             ,head
-             (xhtml:body
-              (xhtml:h2 ,name-and-synopsis)
-              (xhtml:p ,description)
-              ,summary
-              (xhtml:h4 "Downloads:") ,downloads
-              . ,rest)))
-     sxml))
-   (let* ((name       (match:prefix (string-match ": " name-and-synopsis)))
-          (synopsis   (match:suffix (string-match ": " name-and-synopsis)))
-          (version    (nodes->text (table-datum summary "Version:")))
-          (license    ((compose string->license nodes->text)
-                       (table-datum summary "License:")))
-          (home-page  (nodes->text ((sxpath '((xhtml:a 1)))
-                                    (table-datum summary "URL:"))))
-          (source-url (downloads->url downloads))
-          (tarball    (with-store store (download-to-store store source-url)))
-          (sysdepends (map match:substring
-                           (list-matches
-                            "[^ ]+"
-                            ;; Strip off comma and parenthetical
-                            ;; expressions.
-                            (regexp-substitute/global
-                             #f "(,|\\([^\\)]+\\))"
-                             (nodes->text (table-datum summary
-                                                       "SystemRequirements:"))
-                             'pre 'post))))
-          (imports    (map guix-name
-                           ((sxpath '(// xhtml:a *text*))
-                            (table-datum summary "Imports:")))))
-     `(package
-        (name ,(guix-name name))
-        (version ,version)
-        (source (origin
-                  (method url-fetch)
-                  (uri (cran-uri ,name version))
-                  (sha256
-                   (base32
-                    ,(bytevector->nix-base32-string (file-sha256 tarball))))))
-        (build-system r-build-system)
-        ,@(maybe-inputs sysdepends)
-        ,@(maybe-inputs imports 'propagated-inputs)
-        (home-page ,(if (string-null? home-page)
-                        (string-append %cran-url name)
-                        home-page))
-        (synopsis ,synopsis)
-        ;; Use double spacing
-        (description ,(regexp-substitute/global #f "\\. \\b" description
-                                                'pre ".  " 'post))
-        (license ,license)))))
+  (let* ((name       (assoc-ref meta "Package"))
+         (synopsis   (assoc-ref meta "Title"))
+         (version    (assoc-ref meta "Version"))
+         (license    (string->license (assoc-ref meta "License")))
+         ;; Some packages have multiple home pages.  Some have none.
+         (home-page  (match (listify meta "URL")
+                       ((url rest ...) url)
+                       (_ (string-append %cran-url name))))
+         (source-url (match (cran-uri name version)
+                       ((url rest ...) url)
+                       (_ #f)))
+         (tarball    (with-store store (download-to-store store source-url)))
+         (sysdepends (map string-downcase (listify meta "SystemRequirements")))
+         (propagate  (map guix-name (lset-union equal?
+                                                (listify meta "Imports")
+                                                (listify meta "LinkingTo")
+                                                (delete "R"
+                                                        (listify meta 
"Depends"))))))
+    `(package
+       (name ,(guix-name name))
+       (version ,version)
+       (source (origin
+                 (method url-fetch)
+                 (uri (cran-uri ,name version))
+                 (sha256
+                  (base32
+                   ,(bytevector->nix-base32-string (file-sha256 tarball))))))
+       (properties ,`(,'quasiquote ((,'upstream-name . ,name))))
+       (build-system r-build-system)
+       ,@(maybe-inputs sysdepends)
+       ,@(maybe-inputs propagate 'propagated-inputs)
+       (home-page ,(if (string-null? home-page)
+                       (string-append %cran-url name)
+                       home-page))
+       (synopsis ,synopsis)
+       (description ,(beautify-description (assoc-ref meta "Description")))
+       (license ,license))))
 
 (define (cran->guix-package package-name)
   "Fetch the metadata for PACKAGE-NAME from cran.r-project.org, and return the
 `package' s-expression corresponding to that package, or #f on failure."
   (let ((module-meta (cran-fetch package-name)))
-    (and=> module-meta cran-sxml->sexp)))
+    (and=> module-meta description->package)))
 
 
 ;;;
@@ -209,32 +203,33 @@ representation of the package page."
 
 (define (latest-release package)
   "Return an <upstream-source> for the latest release of PACKAGE."
-  (define name
-    (if (string-prefix? "r-" package)
-        (string-drop package 2)
-        package))
-
-  (define sxml
-    (cran-fetch name))
-
-  (and sxml
-       (sxml-match-let*
-        (((*TOP* (xhtml:html
-                  ,head
-                  (xhtml:body
-                   (xhtml:h2 ,name-and-synopsis)
-                   (xhtml:p ,description)
-                   ,summary
-                   (xhtml:h4 "Downloads:") ,downloads
-                   . ,rest)))
-          sxml))
-        (let ((version (nodes->text (table-datum summary "Version:")))
-              (url     (downloads->url downloads)))
-          ;; CRAN does not provide signatures.
-          (upstream-source
-           (package package)
-           (version version)
-           (urls (list url)))))))
+
+  (define (package->cran-name package)
+    (match (package-source package)
+      ((? origin? origin)
+       (match (origin-uri origin)
+         ((url rest ...)
+          (let ((end   (string-rindex url #\_))
+                (start (string-rindex url #\/)))
+            ;; The URL ends on
+            ;; (string-append "/" name "_" version ".tar.gz")
+            (substring url start end)))
+         (_ #f)))
+    (_ #f)))
+
+  (define cran-name
+    (package->cran-name (specification->package package)))
+
+  (define meta
+    (cran-fetch cran-name))
+
+  (and meta
+       (let ((version (assoc-ref meta "Version")))
+         ;; CRAN does not provide signatures.
+         (upstream-source
+          (package package)
+          (version version)
+          (urls (cran-uri cran-name version))))))
 
 (define (cran-package? package)
   "Return true if PACKAGE is an R package from CRAN."
diff --git a/tests/cran.scm b/tests/cran.scm
index ba5699a..0a4a2fd 100644
--- a/tests/cran.scm
+++ b/tests/cran.scm
@@ -19,120 +19,84 @@
 (define-module (test-cran)
   #:use-module (guix import cran)
   #:use-module (guix tests)
+  #:use-module (srfi srfi-1)
   #:use-module (srfi srfi-64)
+  #:use-module (srfi srfi-26)
   #:use-module (ice-9 match))
 
-(define sxml
-  '(*TOP* (xhtml:html
-           (xhtml:head
-            (xhtml:title "CRAN - Package my-example-sxml"))
-           (xhtml:body
-            (xhtml:h2 "my-example-sxml: Short description")
-            (xhtml:p "Long description")
-            (xhtml:table
-             (@ (summary "Package my-example-sxml summary"))
-             (xhtml:tr
-              (xhtml:td "Version:")
-              (xhtml:td "1.2.3"))
-             (xhtml:tr
-              (xhtml:td "Depends:")
-              (xhtml:td "R (>= 3.1.0)"))
-             (xhtml:tr
-              (xhtml:td "SystemRequirements:")
-              (xhtml:td "cairo (>= 1.2 http://www.cairographics.org/)"))
-             (xhtml:tr
-              (xhtml:td "Imports:")
-              (xhtml:td
-               (xhtml:a (@ (href "../scales/index.html"))
-                        "scales")
-               " (>= 0.2.3), "
-               (xhtml:a (@ (href "../proto/index.html"))
-                        "proto")
-               ", "
-               (xhtml:a (@ (href "../Rcpp/index.html")) "Rcpp")
-               " (>= 0.11.0)"))
-             (xhtml:tr
-              (xhtml:td "Suggests:")
-              (xhtml:td
-               (xhtml:a (@ (href "../some/index.html"))
-                        "some")
-               ", "
-               (xhtml:a (@ (href "../suggestions/index.html"))
-                        "suggestions")))
-             (xhtml:tr
-              (xhtml:td "License:")
-              (xhtml:td
-               (xhtml:a (@ (href "../../licenses/MIT")) "MIT")))
-             (xhtml:tr
-              (xhtml:td "URL:")
-              (xhtml:td
-               (xhtml:a (@ (href "http://gnu.org/s/my-example-sxml";))
-                        "http://gnu.org/s/my-example-sxml";)
-               ", "
-               (xhtml:a (@ (href "http://alternative/home/page";))
-                        "http://alternative/home/page";))))
-            (xhtml:h4 "Downloads:")
-            (xhtml:table
-             (@ (summary "Package my-example-sxml downloads"))
-             (xhtml:tr
-              (xhtml:td " Reference manual: ")
-              (xhtml:td
-               (xhtml:a (@ (href "my-example-sxml.pdf"))
-                        " my-example-sxml.pdf ")))
-             (xhtml:tr
-              (xhtml:td " Package source: ")
-              (xhtml:td
-               (xhtml:a
-                (@ (href "../../../src/contrib/my-example-sxml_1.2.3.tar.gz"))
-                " my-example-sxml_1.2.3.tar.gz "))))
-            (xhtml:h4 "Reverse dependencies:")
-            (xhtml:table
-             (@ (summary "Package my-example-sxml reverse dependencies"))
-             (xhtml:tr
-              (xhtml:td "Reverse depends:")
-              (xhtml:td "Too many."))
-             (xhtml:tr
-              (xhtml:td "Reverse imports:")
-              (xhtml:td "Likewise."))
-             (xhtml:tr
-              (xhtml:td "Reverse suggests:")
-              (xhtml:td "Uncountable.")))))))
+(define description "
+Package: My-Example
+Type: Package
+Title: Example package
+Version: 1.2.3
+Date: 2015-12-10
+Author: Ricardo Wurmus
+Maintainer: Guix Schmeeks <address@hidden>
+URL: http://gnu.org/s/my-example
+Description: This is a long description
+spanning multiple lines: and it could confuse the parser that
+there is a colon : on the lines.
+  And: this line continues the description.
+biocViews: 0
+SystemRequirements: Cairo (>= 0)
+Depends: A C++11 compiler. Version 4.6.* of g++ (as
+       currently in Rtools) is insufficient; versions 4.8.*, 4.9.* or
+       later will be fine.
+License: GPL (>= 3)
+Imports: Rcpp (>= 0.11.5), proto, Scales
+LinkingTo: Rcpp, BH
+NeedsCompilation: yes
+Repository: CRAN
+Date/Publication: 2015-07-14 14:15:16
+")
 
-(define simple-table
-  '(xhtml:table
-    (xhtml:tr
-     (xhtml:td "Numbers")
-     (xhtml:td "123"))
-    (xhtml:tr
-     (@ (class "whatever"))
-     (xhtml:td (@ (class "unimportant")) "Letters")
-     (xhtml:td "abc"))
-    (xhtml:tr
-     (xhtml:td "Letters")
-     (xhtml:td "xyz"))
-    (xhtml:tr
-     (xhtml:td "Single"))
-    (xhtml:tr
-     (xhtml:td "not a value")
-     (xhtml:td "not a label")
-     (xhtml:td "also not a label"))))
+(define description-alist
+  ((@@ (guix import cran) description->alist) description))
+
+(define simple-alist
+  '(("Key"        . "Value")
+    ("SimpleList" . "R, Rcpp, something, whatever")
+    ("BadList"    . "This is not a real list, you know?")
+    ("List"       . "R (>= 2.2), BH (for no reason), GenomicRanges")))
 
 (test-begin "cran")
 
-(test-equal "table-datum: return list of first table cell matching label"
-  '((xhtml:td "abc"))
-  ((@@ (guix import cran) table-datum) simple-table "Letters"))
+(test-assert "description->alist: contains all valid keys"
+  (let ((keys '("Package" "Type" "Title" "Version" "Date"
+                "Author" "Maintainer" "URL" "Description"
+                "SystemRequirements" "Depends" "License"
+                "Imports" "biocViews" "LinkingTo"
+                "NeedsCompilation" "Repository"
+                "Date/Publication")))
+    (lset= string=? keys (map car description-alist))))
 
-(test-equal "table-datum: return empty list if no match"
+(test-equal "listify: return empty list if key cannot be found"
   '()
-  ((@@ (guix import cran) table-datum) simple-table "Astronauts"))
+  ((@@ (guix import cran) listify) simple-alist "Letters"))
+
+(test-equal "listify: split comma-separated value into elements"
+  '("R" "Rcpp" "something" "whatever")
+  ((@@ (guix import cran) listify) simple-alist "SimpleList"))
 
-(test-equal "table-datum: only consider the first cell as a label cell"
+(test-equal "listify: strip off parentheses"
+  '("R" "BH" "GenomicRanges")
+  ((@@ (guix import cran) listify) simple-alist "List"))
+
+(test-equal "listify: ignore values that are no lists"
   '()
-  ((@@ (guix import cran) table-datum) simple-table "not a label"))
+  ((@@ (guix import cran) listify) simple-alist "BadList"))
+
+(test-equal "beautify-description: use double spacing"
+  "This is a package.  It is great.  Trust me Mr.  Hendrix."
+  ((@@ (guix import cran) beautify-description)
+   "This is a package. It is great. Trust me Mr. Hendrix."))
 
+(test-equal "beautify-description: transform fragment into sentence"
+  "This package provides a function to establish world peace"
+  ((@@ (guix import cran) beautify-description)
+   "A function to establish world peace"))
 
-(test-assert "cran-sxml->sexp"
+(test-assert "description->package"
   ;; Replace network resources with sample data.
   (mock ((guix build download) url-fetch
          (lambda* (url file-name #:key (mirrors '()))
@@ -140,32 +104,37 @@
              (lambda ()
                (display
                 (match url
-                  ("mirror://cran/src/contrib/my-example-sxml_1.2.3.tar.gz"
+                  ("mirror://cran/src/contrib/My-Example_1.2.3.tar.gz"
                    "source")
                   (_ (error "Unexpected URL: " url))))))))
-    (match ((@@ (guix import cran) cran-sxml->sexp) sxml)
+    (match ((@@ (guix import cran) description->package) description-alist)
       (('package
-         ('name "r-my-example-sxml")
+         ('name "r-my-example")
          ('version "1.2.3")
          ('source ('origin
                     ('method 'url-fetch)
-                    ('uri ('cran-uri "my-example-sxml" 'version))
+                    ('uri ('cran-uri "My-Example" 'version))
                     ('sha256
                      ('base32
                       (? string? hash)))))
+         ('properties ('quasiquote (('upstream-name . "My-Example"))))
          ('build-system 'r-build-system)
          ('inputs
           ('quasiquote
            (("cairo" ('unquote 'cairo)))))
          ('propagated-inputs
           ('quasiquote
-           (("r-proto" ('unquote 'r-proto))
+           (("r-bh" ('unquote 'r-bh))
+            ("r-proto" ('unquote 'r-proto))
             ("r-rcpp" ('unquote 'r-rcpp))
             ("r-scales" ('unquote 'r-scales)))))
-         ('home-page "http://gnu.org/s/my-example-sxml";)
-         ('synopsis "Short description")
-         ('description "Long description")
-         ('license 'x11)))
+         ('home-page "http://gnu.org/s/my-example";)
+         ('synopsis "Example package")
+         ('description
+          "This is a long description spanning multiple lines: \
+and it could confuse the parser that there is a colon : on the \
+lines.  And: this line continues the description.")
+         ('license 'gpl3+)))
       (x
        (begin
          (format #t "~s\n" x)
-- 
2.1.0

~~ Ricardo

reply via email to

[Prev in Thread] Current Thread [Next in Thread]